From 06aec46aa40a71fc2c2f0b792b97a25187263179 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Jun 2020 11:44:55 +1000 Subject: [PATCH 001/241] linux-next-post Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7efe0d499c1d90..715774d8c55f93 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,6 +118,7 @@ static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; +static int two_hundred = 200; static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; @@ -2732,7 +2733,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = &two_hundred, }, #ifdef CONFIG_HUGETLB_PAGE { From 5ea7a5fe200ef89c98f452e5344f0df1f659c297 Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Thu, 4 Jun 2020 11:44:56 +1000 Subject: [PATCH 002/241] dynamic_debug: add an option to enable dynamic debug for modules only Instead of enabling dynamic debug globally with CONFIG_DYNAMIC_DEBUG, CONFIG_DYNAMIC_DEBUG_CORE will enable only the core functionality of dynamic debug. With DYNAMIC_DEBUG_MODULE defined for a module, dynamic debug will be tied to it. This is useful for people who only want to enable dynamic debug for kernel modules without worrying about the kernel image size and memory consumption increasing too much. Link: http://lkml.kernel.org/r/1586521984-5890-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Acked-by: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Baron Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/dynamic-debug-howto.rst | 7 +++++-- include/linux/dev_printk.h | 6 ++++-- include/linux/dynamic_debug.h | 2 +- include/linux/printk.h | 9 ++++++--- lib/Kconfig.debug | 12 ++++++++++++ lib/Makefile | 2 +- lib/dynamic_debug.c | 9 +++++++-- 7 files changed, 36 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5fa..fa5b8d45b34276 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,8 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. -If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just -shortcut for ``print_hex_dump(KERN_DEBUG)``. +If ``CONFIG_DYNAMIC_DEBUG_CORE`` is set, only the modules with ``DEBUG_MODULE`` +defined will be tied into dynamic debug. + +If ``CONFIG_DYNAMIC_DEBUG`` or ``CONFIG_DYNAMIC_DEBUG_CORE`` is not set, +``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is its ``prefix_str`` argument, if it is constant string; or ``hexdump`` diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7bb1..2fb0671152b09c 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...)
\ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de7f..abcd5fde30eb73 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06f0..3f8dea241ddbec 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) #include /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ffcf76279b73aa..ba7a5ca46b254a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DEBUG_MODULE defined for each of them, especially for the case + of embedded system where the kernel image size is sensitive for + people. 
+ config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 3faa033af2f0ff..b1c42c10073b93 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -193,7 +193,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab506..321437bbf87dd3 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; From f81dd828b65bad184bb8b30a5759d297127a3fa6 Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Thu, 4 Jun 2020 11:44:56 +1000 Subject: [PATCH 003/241] dynamic_debug-add-an-option-to-enable-dynamic-debug-for-modules-only-v2 1) Change DEBUG_MODULE to DYNAMIC_DEBUG_MODULE. 2) Change more #if defined(DYNAMIC_DEBUG) condition (in net.h, netdevice.h and ib_verbs.h). 3) Rewrite description in howto document. Link: http://lkml.kernel.org/r/1587408228-10861-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Cc: Greg Kroah-Hartman Cc: Jason Baron Cc: Jonathan Corbet Cc: Petr Mladek Cc: Randy Dunlap Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/dynamic-debug-howto.rst | 10 ++++++---- include/linux/dev_printk.h | 4 ++-- include/linux/net.h | 3 ++- include/linux/netdevice.h | 6 ++++-- include/linux/printk.h | 6 +++--- include/rdma/ib_verbs.h | 6 ++++-- lib/Kconfig.debug | 6 +++--- 7 files changed, 24 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index fa5b8d45b34276..1012bd9305e905 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,11 +13,13 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. -If ``CONFIG_DYNAMIC_DEBUG_CORE`` is set, only the modules with ``DEBUG_MODULE`` -defined will be tied into dynamic debug. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. -If ``CONFIG_DYNAMIC_DEBUG`` or ``CONFIG_DYNAMIC_DEBUG_CORE`` is not set, -``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. +If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just +shortcut for ``print_hex_dump(KERN_DEBUG)``. 
For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is its ``prefix_str`` argument, if it is constant string; or ``hexdump`` diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 2fb0671152b09c..3028b644b4fbde 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -110,7 +110,7 @@ void _dev_info(const struct device *dev, const char *fmt, ...) _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -183,7 +183,7 @@ do { \ #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a59d..016a9c5faa3479 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36fc..5b364a2e000625 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3f8dea241ddbec..fc8f03c5454302 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -400,7 +400,7 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include /** @@ -537,7 +537,7 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) 
\ do { \ @@ -585,7 +585,7 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif #if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DEBUG_MODULE)) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c35..ef2f3986c49334 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ba7a5ca46b254a..5f32626d08eae1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -173,9 +173,9 @@ config DYNAMIC_DEBUG_CORE help Enable core functional support of dynamic debug. It is useful when you want to tie dynamic debug to your kernel modules with - DEBUG_MODULE defined for each of them, especially for the case - of embedded system where the kernel image size is sensitive for - people. + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" From 62c0c02570d35359e0d7a61c6cf70f1e6b06700c Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Thu, 4 Jun 2020 11:44:57 +1000 Subject: [PATCH 004/241] kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. 
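As a worked example of composing the bitmask (a purely illustrative sketch, assuming the TAINT_* bit numbers from include/linux/kernel.h, where TAINT_BAD_PAGE is bit 5 and TAINT_DIE is bit 7): unsigned long mask = (1UL << 5) | (1UL << 7); /* 0x20 | 0x80 == 0xa0 */ Booting with 'panic_on_taint=0xa0,nousertaint' would then panic on either of those taints, while writes to /proc/sys/kernel/tainted matching the mask are rejected with EINVAL.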
Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Rafael Aquini Suggested-by: Qian Cai Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kdump/kdump.rst | 8 +++++ .../admin-guide/kernel-parameters.txt | 13 +++++++ Documentation/admin-guide/sysctl/kernel.rst | 7 ++++ include/linux/kernel.h | 3 ++ kernel/panic.c | 34 +++++++++++++++++++ kernel/sysctl.c | 11 +++++- 6 files changed, 75 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d293503..2da65fef2a1c63 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 09b697e7e14fb1..daaa7db4314028 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally call panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d0114110..3b00b92231578c 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1239,6 +1239,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. 
threads-max =========== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9b7a8d74a9d629..f7835db7102eaf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -528,6 +528,8 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; @@ -596,6 +598,7 @@ extern enum system_states { #define TAINT_AUX 16 #define TAINT_RANDSTRUCT 17 #define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 1cfb47d996d843..7e62535ac4bd12 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -688,3 +695,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f93..587ed0494f2f62 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. 
Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); From 2e4b64f029b414211cc40facda55e32cef07bdf9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Jun 2020 11:44:58 +1000 Subject: [PATCH 005/241] kernel-add-panic_on_taint-fix tweak kernel-parameters.txt wording Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Qian Cai Cc: Rafael Aquini Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index daaa7db4314028..55781f80696ddc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3447,7 +3447,7 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer - panic_on_taint= Bitmask for conditionally call panic() in add_taint() + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is From 18c47a86f7d962e3d64f30c43d66b8a50e9866b4 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Thu, 4 Jun 2020 11:44:59 +1000 Subject: [PATCH 006/241] xarray.h: correct return code documentation for xa_store_{bh,irq}() __xa_store() and xa_store() document that the functions can fail, and that the return code can be an xa_err() encoded error code. xa_store_bh() and xa_store_irq() do not document that the functions can fail and that they can also return xa_err() encoded error codes. Thus: Update the documentation. Link: http://lkml.kernel.org/r/20200430111424.16634-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/xarray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 3b213c6d17fbe5..8ce9d63d813bad 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -576,7 +576,7 @@ void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) @@ -602,7 +602,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) From b880cc2f158c63a4028d6b4e9af6618c77cd287b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:44:59 +1000 Subject: [PATCH 007/241] kernel/sysctl: support setting sysctl parameters from kernel command line Patch series "support setting sysctl parameters from kernel command line", v3. 
This series adds support for something that seems like many people have always wanted but nobody added yet, so here's the ability to set sysctl parameters via kernel command line options in the form of sysctl.vm.something=1. The important part is Patch 1. The second, not so important part is an attempt to clean up legacy one-off parameters that do the same thing as a sysctl. I don't want to remove them completely for compatibility reasons, but with generic sysctl support the idea is to remove the one-off param handlers and treat the parameters as aliases for the sysctl variants. I have identified several parameters that mention sysctl counterparts in Documentation/admin-guide/kernel-parameters.txt but there might be more. The conversion also has varying levels of success: - numa_zonelist_order is converted in Patch 2 together with adding the necessary infrastructure. It's easy as it doesn't really do anything but warn on a deprecated value these days. - hung_task_panic is converted in Patch 3, but there's a downside that now it only accepts 0 and 1, while previously it was any integer value - nmi_watchdog maps to two sysctls, nmi_watchdog and hardlockup_panic, so there's no straightforward conversion possible - traceoff_on_warning is a flag without a value and it would be required to handle that somehow in the conversion infrastructure, which seems pointless for a single flag This patch (of 5): A recently proposed patch to add a vm_swappiness command line parameter in addition to the existing sysctl [1] made me wonder why we don't have general support for passing sysctl parameters via the command line. Googling found only somebody else wondering the same [2], but I haven't found any prior discussion with reasons why not to do this. Setting the vm_swappiness issue aside (the underlying issue might be solved in a different way), a quick search of kernel-parameters.txt shows there are already some that exist as both sysctl and kernel parameter - hung_task_panic, nmi_watchdog, numa_zonelist_order, traceoff_on_warning. A general mechanism would remove the need to add more of those one-offs and might be handy in situations where configuration by e.g. /etc/sysctl.d/ is impractical. Hence, this patch adds a new parse_args() pass that looks for parameters prefixed by 'sysctl.' and tries to interpret them as writes to the corresponding sys/ files using a temporary in-kernel procfs mount. This mechanism was suggested by Eric W. Biederman [3], as it handles all dynamically registered sysctl tables, even though we don't handle modular sysctls. Errors due to e.g. an invalid parameter name or value are reported in the kernel log. The processing is hooked right before the init process is loaded, as some handlers might be more complicated than simple setters and might need some subsystems to be initialized. By that moment the init process could already be started and could eventually execute a process writing to /proc/sys/, so it should also be fine to do that from the kernel. Sysctls registered later, at module load time, are not set by this mechanism - it's expected that in such scenarios, setting sysctl values from userspace is practical enough.
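To illustrate the resulting usage (reusing the example from the kernel-parameters.txt hunk below), booting with sysctl.vm.swappiness=40 has the same effect as writing '40' to /proc/sys/vm/swappiness right before init is started; and since both '.' and '/' are recognized as separators, sysctl.vm/swappiness=40 is an equivalent spelling.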
[1] https://lore.kernel.org/r/BL0PR02MB560167492CA4094C91589930E9FC0@BL0PR02MB5601.namprd02.prod.outlook.com/ [2] https://unix.stackexchange.com/questions/558802/how-to-set-sysctl-using-kernel-command-line-parameter [3] https://lore.kernel.org/r/87bloj2skm.fsf@x220.int.ebiederm.org/ Link: http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Link: http://lkml.kernel.org/r/20200427180433.7029-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Luis Chamberlain Reviewed-by: Masami Hiramatsu Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: "Eric W . Biederman" Cc: "Guilherme G . Piccoli" Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/kernel-parameters.txt | 9 ++ fs/proc/proc_sysctl.c | 107 ++++++++++++++++++ include/linux/sysctl.h | 4 + init/main.c | 2 + 4 files changed, 122 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 55781f80696ddc..401e77c596ec81 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4971,6 +4971,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index df2143e05c571e..973acf96f37c8a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1703,3 +1704,109 @@ int __init proc_sys_init(void) return sysctl_init(); } + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) + return 0; + + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. 
+ */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f2401e45a3c2bc..50bb7f383a1b5d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +void do_sysctl_args(void); extern int pwrsw_enabled; extern int unaligned_enabled; @@ -235,6 +236,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } +static inline void do_sysctl_args(void) +{ +} #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, diff --git a/init/main.c b/init/main.c index 30271bcb30de9b..0ead83e86b5aa2 100644 --- a/init/main.c +++ b/init/main.c @@ -1414,6 +1414,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) From 0972522eda22022f122de2112d88e2050083cc96 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:45:00 +1000 Subject: [PATCH 008/241] kernel/sysctl: support handling command line aliases We can now handle sysctl parameters on kernel command line, but historically some parameters introduced their own command line equivalent, which we don't want to remove for compatibility reasons. 
We can however convert them to the generic infrastructure with a table translating the legacy command line parameters to their sysctl names, and removing the one-off param handlers. This patch adds the support and makes the first conversion to demonstrate it, on the (deprecated) numa_zonelist_order parameter. Link: http://lkml.kernel.org/r/20200427180433.7029-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Luis Chamberlain Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- fs/proc/proc_sysctl.c | 48 ++++++++++++++++++++++++++++++++++++------- mm/page_alloc.c | 9 -------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 973acf96f37c8a..124298168f8b7a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1705,6 +1705,37 @@ int __init proc_sys_init(void) return sysctl_init(); } +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + /* Set sysctl value passed on kernel command line. 
*/ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) @@ -1718,15 +1749,18 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; - if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) - return 0; - - param += sizeof("sysctl") - 1; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; - if (param[0] != '/' && param[0] != '.') - return 0; + if (param[0] != '/' && param[0] != '.') + return 0; - param++; + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } /* * To set sysctl options, we use a temporary mount of proc, look up the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150e7719b1b538..0c435b2ed665c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5587,15 +5587,6 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* From d92452f2e7329bb61d1a9136534e08800513550f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:45:01 +1000 Subject: [PATCH 009/241] kernel/hung_task convert hung_task_panic boot parameter to sysctl We can now handle sysctl parameters on kernel command line and have infrastructure to convert legacy command line options that duplicate sysctl to become a sysctl alias. This patch converts the hung_task_panic parameter. Note that the sysctl handler is more strict and allows only 0 and 1, while the legacy parameter allowed any non-zero value. But there is little reason anyone would not be using 1. Link: http://lkml.kernel.org/r/20200427180433.7029-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/kernel-parameters.txt | 2 +- fs/proc/proc_sysctl.c | 1 + kernel/hung_task.c | 10 ---------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 401e77c596ec81..309068270dde94 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1515,7 +1515,7 @@ [KNL] Should the hung task detector generate panics. Format: - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. 
The value selected by this boot parameter can diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 124298168f8b7a..15030784566c73 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1721,6 +1721,7 @@ struct sysctl_alias { */ static const struct sysctl_alias sysctl_aliases[] = { {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"hung_task_panic", "kernel.hung_task_panic" }, { } }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 69f54848af7904..7e487d6c61db0f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,16 +63,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { From 541ca5e8ebeedc1831bc94f0765824188b41c524 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:45:02 +1000 Subject: [PATCH 010/241] tools/testing/selftests/sysctl/sysctl.sh: support CONFIG_TEST_SYSCTL=y The testing script recommends CONFIG_TEST_SYSCTL=y, but actually only works with CONFIG_TEST_SYSCTL=m. Testing of sysctl setting via boot param however requires the test to be built-in, so make sure the test script supports it. Link: http://lkml.kernel.org/r/20200427180433.7029-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Luis Chamberlain Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/sysctl/sysctl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index c3459f9f2429be..bea178173cd8f0 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -112,7 +112,7 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" echo "You must set CONFIG_TEST_SYSCTL=m in your kernel" >&2 From fdbfd8ff679246fdb29d8da163171710ee535027 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:45:03 +1000 Subject: [PATCH 011/241] lib/test_sysctl: support testing of sysctl. boot parameter Testing is done by a new parameter debug.test_sysctl.boot_int which defaults to 0 and it's expected that the tester passes a boot parameter that sets it to 1. The test checks if it's set to 1. To distinguish true failure from parameter not being set, the test checks /proc/cmdline for the expected parameter, and whether test_sysctl is built-in and not a module. Link: http://lkml.kernel.org/r/20200427180433.7029-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . 
Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- lib/test_sysctl.c | 13 +++++++++ tools/testing/selftests/sysctl/sysctl.sh | 36 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index ec4d0f03475d74..98bc92a916620e 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -44,6 +44,8 @@ struct test_sysctl_data { int int_0002; int int_0003[4]; + int boot_int; + unsigned int uint_0001; char string_0001[65]; @@ -61,6 +63,8 @@ static struct test_sysctl_data test_data = { .int_0003[2] = 2, .int_0003[3] = 3, + .boot_int = 0, + .uint_0001 = 314, .string_0001 = "(none)", @@ -91,6 +95,15 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "boot_int", + .data = &test_data.boot_int, + .maxlen = sizeof(test_data.boot_int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "uint_0001", .data = &test_data.uint_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index bea178173cd8f0..f9c593da3344b3 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,6 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" function allow_user_defaults() { @@ -744,6 +745,40 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return 0 + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." + echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" +} + list_tests() { echo "Test ID list:" @@ -758,6 +793,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() From 25d63a06c5c0d4d12954636d47610a89984fbeb5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 4 Jun 2020 11:45:03 +1000 Subject: [PATCH 012/241] lib-test_sysctl-support-testing-of-sysctl-boot-parameter-fix Skip the new test if boot_int sysctl is not present, otherwise, per Luis, "This would fail if someone uses this script to test an older kernel, and the scripts in selftests are supposed to work with older kernels." 
Link: http://lkml.kernel.org/r/305af605-1e60-cf84-fada-6ce1ca37c102@suse.cz Signed-off-by: Vlastimil Babka Suggested-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- tools/testing/selftests/sysctl/sysctl.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index f9c593da3344b3..19515dcb7d04e4 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -748,10 +748,15 @@ sysctl_test_0007() { TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + if [ -d $DIR ]; then echo "Boot param test only possible sysctl_test is built-in, not module:" cat $TEST_DIR/config >&2 - return 0 + return $ksft_skip fi echo -n "Testing if $TARGET is set to 1 ..." @@ -777,6 +782,7 @@ sysctl_test_0007() echo "Skipping test, expected kernel parameter missing." echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip } list_tests() From d751578171f56f078e9266919e418f0cda139606 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 4 Jun 2020 11:45:04 +1000 Subject: [PATCH 013/241] kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases After a recent change introduced by Vlastimil's series [0], the kernel is now able to handle sysctl parameters on the kernel command line; also, the series introduced a simple infrastructure to convert legacy boot parameters (that duplicate sysctls) into sysctl aliases. This patch converts the watchdog parameters softlockup_panic and {hard,soft}lockup_all_cpu_backtrace to use the new alias infrastructure. It fixes the documentation too, since the alias only accepts the values 0 or 1, not the full range of integers. We also took the opportunity here to improve the documentation of the previously converted hung_task_panic (see the patch series [0]) and put the alias table in alphabetical order. [0] http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Link: http://lkml.kernel.org/r/20200507214624.21911-1-gpiccoli@canonical.com Signed-off-by: Guilherme G. Piccoli Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- .../admin-guide/kernel-parameters.txt | 10 ++--- fs/proc/proc_sysctl.c | 7 +++- kernel/watchdog.c | 38 +++++-------------- 3 files changed, 19 insertions(+), 36 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 309068270dde94..746030ae3b124e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,7 +1513,7 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: + Format: 0 | 1 A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled @@ -4667,9 +4667,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics.
- Format: + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4678,7 +4678,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 15030784566c73..5b405f32971d64 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,8 +1720,11 @@ struct sysctl_alias { * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { - {"numa_zonelist_order", "vm.numa_zonelist_order" }, - {"hung_task_panic", "kernel.hung_task_panic" }, + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + {"softlockup_panic", "kernel.softlockup_panic" }, { } }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fa5aacbfd0001a..1b939532fcc1b8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,17 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ - atomic_t hardlockup_detected = ATOMIC_INIT(0); static inline void flush_hardlockup_messages(void) @@ -183,6 +177,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -198,13 +196,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -226,17 +217,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* From 
f91b47e83ec6c436f5d4ac9a5542a86bae86bb7e Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 4 Jun 2020 11:45:05 +1000 Subject: [PATCH 014/241] kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change whereby we started to show all CPUs' backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a slightly expensive task, especially when printing them on the console (and especially on machines with many CPUs, as servers commonly have nowadays). So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs' backtraces on dmesg/console, which will delay the panic and pollute the log (the crash tool would easily grab all CPUs' traces with the 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs' backtraces but not have the system panic when a hung task is detected. The current approach is hence almost like embedding a policy in the kernel, by forcing the CPUs' backtrace dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs' backtrace dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analogous to the approach taken for soft/hard lockups, which have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs' backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there are no warnings left. Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Guilherme G. Piccoli Reviewed-by: Kees Cook Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/admin-guide/sysctl/kernel.rst | 14 ++++++++++++++ include/linux/sched/sysctl.h | 7 +++++++ kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b00b92231578c..861820d27c1925 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected.
+ + hung_task_panic =============== diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7b4d3a49b6c574..660ac49f2b531d 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,6 +7,13 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK + +#ifdef CONFIG_SMP +extern unsigned int sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 7e487d6c61db0f..a672db830a946e 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -235,10 +247,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f62..34c1278951b994 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, From 0d9b615530e34b5b1113caed37f5faa8bfb1ed8e Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 4 Jun 2020 11:45:06 +1000 Subject: [PATCH 015/241] panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. 
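For comparison, the behavior such users are trying to avoid is the unconditional escalation on the oops path, roughly (a simplified sketch, not the exact kernel code):

	if (panic_on_oops)
		panic("Fatal exception");	/* kexec boots into the crash kernel; the running VMs are gone */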
This patch implements a way to collect more debug details when an oops event is reached, by printing all CPUs' backtraces through the use of NMIs (on architectures that support that).

The sysctl added (and documented) here is called "oops_all_cpu_backtrace"; when set, it will (as the name suggests) dump all CPUs' backtraces. Far from ideal, this may nevertheless be the last option for users that for some reason cannot panic on oops.

Most of the time, oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses showing up in other guests' CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump.

Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com
Signed-off-by: Guilherme G. Piccoli
Reviewed-by: Kees Cook
Cc: Luis Chamberlain
Cc: Iurii Zaikin
Cc: Thomas Gleixner
Cc: Vlastimil Babka
Cc: Randy Dunlap
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 Documentation/admin-guide/sysctl/kernel.rst | 16 ++++++++++++++++
 include/linux/kernel.h | 6 ++++++
 kernel/panic.c | 11 +++++++++++
 kernel/sysctl.c | 11 +++++++++++
 4 files changed, 44 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 861820d27c1925..83acf50254886b 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -646,6 +646,22 @@ rate for each task.
 scanned for a given scan.
 
+oops_all_cpu_backtrace
+======================
+
+If this option is set, the kernel will send an NMI to all CPUs to dump
+their backtraces when an oops event occurs. It should be used as a last
+resort in case a panic cannot be triggered (to protect running VMs, for
+example) or kdump can't be collected. This file shows up if CONFIG_SMP
+is enabled.
+
+0: Won't show all CPUs backtraces when an oops is detected.
+This is the default behavior.
+
+1: Will non-maskably interrupt all CPUs and dump their backtraces when
+an oops event is detected.
+
+
 osrelease, ostype & version
 ===========================

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f7835db7102eaf..82d91547d122d5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -520,6 +520,12 @@ static inline u32 int_sqrt64(u64 x)
 }
 #endif
 
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
+#endif /* CONFIG_SMP */
+
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;	/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;

diff --git a/kernel/panic.c b/kernel/panic.c
index 7e62535ac4bd12..e2157ca387c832 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,14 @@
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
 
+#ifdef CONFIG_SMP
+/*
+ * Should we dump all CPUs backtraces in an oops event?
+ * Defaults to 0, can be changed via sysctl.
+ */
+unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#endif /* CONFIG_SMP */
+
 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 static unsigned long tainted_mask =
	IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ?
(1 << TAINT_RANDSTRUCT) : 0;

@@ -522,6 +530,9 @@ void oops_enter(void)
 	/* can't trust the integrity of the kernel anymore: */
 	debug_locks_off();
 	do_oops_enter_exit();
+
+	if (sysctl_oops_all_cpu_backtrace)
+		trigger_all_cpu_backtrace();
 }
 
 /*

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34c1278951b994..f69d581d39c31f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SMP
+	{
+		.procname	= "oops_all_cpu_backtrace",
+		.data		= &sysctl_oops_all_cpu_backtrace,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_SMP */
 	{
 		.procname	= "pid_max",
 		.data		= &pid_max,

From 319a871115e3fa7136ff823ece71f5e6e4291e34 Mon Sep 17 00:00:00 2001
From: Rafael Aquini
Date: Thu, 4 Jun 2020 11:45:07 +1000
Subject: [PATCH 016/241] kernel/sysctl.c: ignore out-of-range taint bits introduced via kernel.tainted

Users with SYS_ADMIN capability can add arbitrary taint flags to the running kernel by writing to /proc/sys/kernel/tainted or issuing the command 'sysctl -w kernel.tainted=...'. This interface, however, accepts any integer value, which might cause an invalid set of flags to be committed to the tainted_mask bitset.

This patch introduces a simple way for proc_taint() to ignore any invalid bits coming from user input before committing those bits to the kernel tainted_mask.

Link: http://lkml.kernel.org/r/20200512223946.888020-1-aquini@redhat.com
Signed-off-by: Rafael Aquini
Reviewed-by: Luis Chamberlain
Cc: Kees Cook
Cc: Iurii Zaikin
Cc: "Theodore Ts'o"
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/sysctl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f69d581d39c31f..db1ce7af25632d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -880,10 +880,9 @@ static int proc_taint(struct ctl_table *table, int write,
 		 * Poor man's atomic or. Not worth adding a primitive
 		 * to everyone's atomic.h for this
 		 */
-		for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
-			if ((tmptaint >> i) & 1)
+		for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+			if ((1UL << i) & tmptaint)
 				add_taint(i, LOCKDEP_STILL_OK);
-		}
 	}
 
 	return err;

From cf9426e993edab9aa37942cc46696c7d2fd05622 Mon Sep 17 00:00:00 2001
From: Walter Wu
Date: Thu, 4 Jun 2020 11:45:07 +1000
Subject: [PATCH 017/241] stacktrace: cleanup inconsistent variable type

Modify the type of the 'skip' member of struct stack_trace. It should be unsigned int, for two reasons:

- 'skip' only ever holds two kinds of values: positive values and zero.

- The 'skip' of struct stack_trace has a type inconsistent with the one in struct stack_trace_data, which causes some confusion about the relationship between struct stack_trace and stack_trace_data.
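For illustration, a typical caller fills the structure along these lines (a minimal sketch; the array size and skip count are made up), which shows why 'skip' never needs to be negative:

	unsigned long entries[8];
	struct stack_trace trace = {
		.max_entries	= ARRAY_SIZE(entries),
		.entries	= entries,
		.skip		= 2,	/* drop the two innermost frames; always zero or positive */
	};

	save_stack_trace(&trace);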
Link: http://lkml.kernel.org/r/20200421013511.5960-1-walter-zh.wu@mediatek.com Signed-off-by: Walter Wu Reviewed-by: Bart Van Assche Cc: Matthias Brugger Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/stacktrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 83bd8cb475d7e5..b7af8cc13eda4f 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -64,7 +64,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; - int skip; /* input argument: How many entries to skip */ + unsigned int skip; /* input argument: How many entries to skip */ }; extern void save_stack_trace(struct stack_trace *trace); From 50e3ef91512743522caef0439f685cdd3667db15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 11:45:08 +1000 Subject: [PATCH 018/241] amdgpu: a NULL ->mm does not mean a thread is a kthread Use the proper API instead. Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de Link: http://lkml.kernel.org/r/20200404094101.672954-2-hch@lst.de Fixes: 70539bd795002 ("drm/amd: Update MEC HQD loading code for KFD") Signed-off-by: Christoph Hellwig Reviewed-by: Felix Kuehling Reviewed-by: Jens Axboe Tested-by: Jens Axboe Cc: Al Viro Cc: Alex Deucher Cc: Zhenyu Wang Cc: Zhi Wang Cc: Felipe Balbi Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3f2b695cf19e2a..dc99318a5b3d68 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -195,7 +195,7 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s pagefault_disable(); \ if ((mmptr) == current->mm) { \ valid = !get_user((dst), (wptr)); \ - } else if (current->mm == NULL) { \ + } else if (current->flags & PF_KTHREAD) { \ use_mm(mmptr); \ valid = !get_user((dst), (wptr)); \ unuse_mm(mmptr); \ From 8409dd60be8052f2c22c448ffe8600c6add056d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 11:45:09 +1000 Subject: [PATCH 019/241] kernel: move use_mm/unuse_mm to kthread.c Patch series "improve use_mm / unuse_mm", v2. This series improves the use_mm / unuse_mm interface by better documenting the assumptions, and my taking the set_fs manipulations spread over the callers into the core API. This patch (of 3): Use the proper API instead. Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de These helpers are only for use with kernel threads, and I will tie them more into the kthread infrastructure going forward. Also move the prototypes to kthread.h - mmu_context.h was a little weird to start with as it otherwise contains very low-level MM bits. Link: http://lkml.kernel.org/r/20200404094101.672954-1-hch@lst.de Link: http://lkml.kernel.org/r/20200416053158.586887-1-hch@lst.de Link: http://lkml.kernel.org/r/20200404094101.672954-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Jens Axboe Tested-by: Jens Axboe Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. 
Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 1 - .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 1 - .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 2 - .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 2 - .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 2 - drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/vhost/vhost.c | 1 - fs/aio.c | 1 - fs/io-wq.c | 1 - fs/io_uring.c | 1 - include/linux/kthread.h | 5 ++ include/linux/mmu_context.h | 5 -- kernel/kthread.c | 56 ++++++++++++++++ mm/Makefile | 2 +- mm/mmu_context.c | 64 ------------------- 18 files changed, 66 insertions(+), 85 deletions(-) delete mode 100644 mm/mmu_context.c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index dc99318a5b3d68..16c215dce8fa2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index 6529caca88fe15..35d4a5ab022808 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "amdgpu.h" #include "amdgpu_amdkfd.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 691c89705bcdc0..bf927f432506dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -19,7 +19,6 @@ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ -#include #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "gc/gc_10_1_0_offset.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 0b7e7874854068..7d01420c0c85e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -20,8 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include - #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "cikd.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index ccd635b812b550..635cd1a26bed56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -20,8 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include - #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "gfx_v8_0.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index df841c2ac5e741..c7fd0c47b25453 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -19,8 +19,6 @@ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include - #include "amdgpu.h" #include "amdgpu_amdkfd.h" #include "gc/gc_9_0_offset.h" diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index eee530453aa677..ad8a9df49f295f 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 494f853f2206a1..7ae54b7b637ba8 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 3afddd3bea6e71..20fba95ed0a65f 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index abdba14770cbeb..fa93f2c76c5cab 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/aio.c b/fs/aio.c index e0c38bbd154666..e35c22fb76665c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/io-wq.c b/fs/io-wq.c index 4023c984686086..5f590bf27bffd9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/io_uring.c b/fs/io_uring.c index 95747c9115fc85..f4dc707f14be0b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c2a274b79c429e..1e4dc4c4c48658 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -5,6 +5,8 @@ #include #include +struct mm_struct; + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -199,6 +201,9 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +void use_mm(struct mm_struct *mm); +void unuse_mm(struct mm_struct *mm); + struct cgroup_subsys_state; #ifdef CONFIG_BLK_CGROUP diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index d9a543a9e1ccec..c51a84132d7c08 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -4,11 +4,6 @@ #include -struct mm_struct; - -void use_mm(struct mm_struct *mm); -void unuse_mm(struct mm_struct *mm); - /* Architectures that care about IRQ state in switch_mm can override this. */ #ifndef switch_mm_irqs_off # define switch_mm_irqs_off switch_mm diff --git a/kernel/kthread.c b/kernel/kthread.c index b84fc7eec0358e..8203d9631d964c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. + * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). 
*/ #include +#include +#include #include +#include #include #include #include @@ -25,6 +29,7 @@ #include #include + static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -1220,6 +1225,57 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + mmgrab(mm); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != mm) + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + sync_mm_rss(mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); + #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread diff --git a/mm/Makefile b/mm/Makefile index fa91e963c2f9e6..6e9d46b2efc9a0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -49,7 +49,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ - mm_init.o mmu_context.o percpu.o slab_common.o \ + mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o $(mmu-y) diff --git a/mm/mmu_context.c b/mm/mmu_context.c deleted file mode 100644 index 3e612ae748e966..00000000000000 --- a/mm/mmu_context.c +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (C) 2009 Red Hat, Inc. - * - * See ../COPYING for licensing terms. - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -void use_mm(struct mm_struct *mm) -{ - struct mm_struct *active_mm; - struct task_struct *tsk = current; - - task_lock(tsk); - active_mm = tsk->active_mm; - if (active_mm != mm) { - mmgrab(mm); - tsk->active_mm = mm; - } - tsk->mm = mm; - switch_mm(active_mm, mm, tsk); - task_unlock(tsk); -#ifdef finish_arch_post_lock_switch - finish_arch_post_lock_switch(); -#endif - - if (active_mm != mm) - mmdrop(active_mm); -} -EXPORT_SYMBOL_GPL(use_mm); - -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. 
releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -void unuse_mm(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - sync_mm_rss(mm); - tsk->mm = NULL; - /* active_mm is still 'mm' */ - enter_lazy_tlb(mm, tsk); - task_unlock(tsk); -} -EXPORT_SYMBOL_GPL(unuse_mm); From 599cae232df6195b7017ec143b0d930abfb33889 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 11:45:09 +1000 Subject: [PATCH 020/241] kernel: move use_mm/unuse_mm to kthread.c cover the newly merged use_mm/unuse_mm caller in vfio Link: http://lkml.kernel.org/r/20200416053158.586887-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jens Axboe Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 391fafe82c5c06..b42fec87d8cb16 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include From 3e8a9167bd209b96b78a14cd157caa94ece9857a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 11:45:10 +1000 Subject: [PATCH 021/241] kernel: better document the use_mm/unuse_mm API contract Switch the function documentation to kerneldoc comments, and add WARN_ON_ONCE asserts that the calling thread is a kernel thread and does not have ->mm set (or has ->mm set in the case of unuse_mm). Also give the functions a kthread_ prefix to better document the use case. Link: http://lkml.kernel.org/r/20200404094101.672954-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Jens Axboe Tested-by: Jens Axboe Acked-by: Greg Kroah-Hartman [usb] Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Jason Wang Cc: "Michael S. 
Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 +-- drivers/usb/gadget/function/f_fs.c | 4 +-- drivers/usb/gadget/legacy/inode.c | 4 +-- drivers/vhost/vhost.c | 4 +-- fs/io-wq.c | 6 ++-- fs/io_uring.c | 4 +-- include/linux/kthread.h | 4 +-- kernel/kthread.c | 33 +++++++++++----------- mm/oom_kill.c | 6 ++-- mm/vmacache.c | 4 +-- 10 files changed, 36 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 16c215dce8fa2f..2affc6a2fea8ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -197,9 +197,9 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev *s if ((mmptr) == current->mm) { \ valid = !get_user((dst), (wptr)); \ } else if (current->flags & PF_KTHREAD) { \ - use_mm(mmptr); \ + kthread_use_mm(mmptr); \ valid = !get_user((dst), (wptr)); \ - unuse_mm(mmptr); \ + kthread_unuse_mm(mmptr); \ } \ pagefault_enable(); \ } \ diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 7ae54b7b637ba8..f80b2747d7c57e 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -827,9 +827,9 @@ static void ffs_user_copy_worker(struct work_struct *work) mm_segment_t oldfs = get_fs(); set_fs(USER_DS); - use_mm(io_data->mm); + kthread_use_mm(io_data->mm); ret = ffs_copy_to_iter(io_data->buf, ret, &io_data->data); - unuse_mm(io_data->mm); + kthread_unuse_mm(io_data->mm); set_fs(oldfs); } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 20fba95ed0a65f..9ee0bfe7bcdaea 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -462,9 +462,9 @@ static void ep_user_copy_worker(struct work_struct *work) struct kiocb *iocb = priv->iocb; size_t ret; - use_mm(mm); + kthread_use_mm(mm); ret = copy_to_iter(priv->buf, priv->actual, &priv->to); - unuse_mm(mm); + kthread_unuse_mm(mm); if (!ret) ret = -EFAULT; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index fa93f2c76c5cab..7c29493ff3f1e4 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -339,7 +339,7 @@ static int vhost_worker(void *data) mm_segment_t oldfs = get_fs(); set_fs(USER_DS); - use_mm(dev->mm); + kthread_use_mm(dev->mm); for (;;) { /* mb paired w/ kthread_stop */ @@ -367,7 +367,7 @@ static int vhost_worker(void *data) schedule(); } } - unuse_mm(dev->mm); + kthread_unuse_mm(dev->mm); set_fs(oldfs); return 0; } diff --git a/fs/io-wq.c b/fs/io-wq.c index 5f590bf27bffd9..748621f7391ec6 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -170,7 +170,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) } __set_current_state(TASK_RUNNING); set_fs(KERNEL_DS); - unuse_mm(worker->mm); + kthread_unuse_mm(worker->mm); mmput(worker->mm); worker->mm = NULL; } @@ -417,7 +417,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) { if (worker->mm) { - unuse_mm(worker->mm); + kthread_unuse_mm(worker->mm); mmput(worker->mm); worker->mm = NULL; } @@ -426,7 +426,7 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) return; } if (mmget_not_zero(work->mm)) { - use_mm(work->mm); + kthread_use_mm(work->mm); if (!worker->mm) set_fs(USER_DS); worker->mm = work->mm; diff --git a/fs/io_uring.c b/fs/io_uring.c index 
f4dc707f14be0b..1c0697433f28ed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6008,7 +6008,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[req->opcode].needs_mm && !current->mm) { if (unlikely(!mmget_not_zero(ctx->sqo_mm))) return -EFAULT; - use_mm(ctx->sqo_mm); + kthread_use_mm(ctx->sqo_mm); } sqe_flags = READ_ONCE(sqe->flags); @@ -6118,7 +6118,7 @@ static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) struct mm_struct *mm = current->mm; if (mm) { - unuse_mm(mm); + kthread_unuse_mm(mm); mmput(mm); } } diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 1e4dc4c4c48658..65b81e0c494d20 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -201,8 +201,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); -void use_mm(struct mm_struct *mm); -void unuse_mm(struct mm_struct *mm); +void kthread_use_mm(struct mm_struct *mm); +void kthread_unuse_mm(struct mm_struct *mm); struct cgroup_subsys_state; diff --git a/kernel/kthread.c b/kernel/kthread.c index 8203d9631d964c..02635ad4d91699 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1225,18 +1225,18 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -/* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_use_mm - make the calling kthread operate on an address space + * @mm: address space to operate on */ -void use_mm(struct mm_struct *mm) +void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(tsk->mm); + task_lock(tsk); active_mm = tsk->active_mm; if (active_mm != mm) { @@ -1253,20 +1253,19 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } -EXPORT_SYMBOL_GPL(use_mm); +EXPORT_SYMBOL_GPL(kthread_use_mm); -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) +/** + * kthread_use_mm - reverse the effect of kthread_use_mm() + * @mm: address space to operate on */ -void unuse_mm(struct mm_struct *mm) +void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; + WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); + WARN_ON_ONCE(!tsk->mm); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; @@ -1274,7 +1273,7 @@ void unuse_mm(struct mm_struct *mm) enter_lazy_tlb(mm, tsk); task_unlock(tsk); } -EXPORT_SYMBOL_GPL(unuse_mm); +EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4daedf7b91f67b..463b3d74a64a28 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -126,7 +126,7 @@ static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) /* * The process p may have detached its own ->mm while exiting or through - * use_mm(), but one or more of its subthreads may still have a valid + * kthread_use_mm(), but one or more of its subthreads may still have a valid * pointer. Return p, or any of its subthreads with a valid ->mm, with * task_lock() held. 
*/ @@ -919,8 +919,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; } /* - * No use_mm() user needs to read from the userspace so we are - * ok to reap it. + * No kthead_use_mm() user needs to read from the userspace so + * we are ok to reap it. */ if (unlikely(p->flags & PF_KTHREAD)) continue; diff --git a/mm/vmacache.c b/mm/vmacache.c index cdc32a3b02fa0d..ceedbab82106ea 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -25,8 +25,8 @@ * task's vmacache pertains to a different mm (ie, its own). There is * nothing we can do here. * - * Also handle the case where a kernel thread has adopted this mm via use_mm(). - * That kernel thread's vmacache is not applicable to this mm. + * Also handle the case where a kernel thread has adopted this mm via + * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. */ static inline bool vmacache_valid_mm(struct mm_struct *mm) { From 597211ba8c3677ac3e9153e1c86ff48eac4b80a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jun 2020 11:45:11 +1000 Subject: [PATCH 022/241] kernel-better-document-the-use_mm-unuse_mm-api-contract-v2 fix a comment typo, cover the newly merged use_mm/unuse_mm caller in vfio Link: http://lkml.kernel.org/r/20200416053158.586887-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Jason Wang Cc: Jens Axboe Cc: "Michael S. Tsirkin" Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/vfio/vfio_iommu_type1.c | 4 ++-- kernel/kthread.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b42fec87d8cb16..f7a2b7557d4a67 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2817,7 +2817,7 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, return -EPERM; if (kthread) - use_mm(mm); + kthread_use_mm(mm); else if (current->mm != mm) goto out; @@ -2844,7 +2844,7 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, *copied = copy_from_user(data, (void __user *)vaddr, count) ? 0 : count; if (kthread) - unuse_mm(mm); + kthread_unuse_mm(mm); out: mmput(mm); return *copied ? 
0 : -EFAULT;

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 02635ad4d91699..f4373cca41b011 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1256,7 +1256,7 @@ void kthread_use_mm(struct mm_struct *mm)
 EXPORT_SYMBOL_GPL(kthread_use_mm);
 
 /**
- * kthread_use_mm - reverse the effect of kthread_use_mm()
+ * kthread_unuse_mm - reverse the effect of kthread_use_mm()
  * @mm: address space to operate on
  */
 void kthread_unuse_mm(struct mm_struct *mm)

From 6cbe57e16797b80e84358907298171c2a1542dca Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 4 Jun 2020 11:45:11 +1000
Subject: [PATCH 023/241] powerpc/vas: fix up for {un}use_mm() rename

Link: http://lkml.kernel.org/r/20200422163935.5aa93ba5@canb.auug.org.au
Signed-off-by: Stephen Rothwell
Acked-by: Haren Myneni
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 arch/powerpc/platforms/powernv/vas-fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c
index 25db70be4c9ce8..266a6ca5e15e67 100644
--- a/arch/powerpc/platforms/powernv/vas-fault.c
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -127,7 +127,7 @@ static void update_csb(struct vas_window *window,
 		return;
 	}
 
-	use_mm(window->mm);
+	kthread_use_mm(window->mm);
 
 	rc = copy_to_user(csb_addr, &csb, sizeof(csb));
 	/*
 	 * User space polls on csb.flags (first byte). So add barrier
@@ -139,7 +139,7 @@ static void update_csb(struct vas_window *window,
 		smp_mb();
 		rc = copy_to_user(csb_addr, &csb, sizeof(u8));
 	}
-	unuse_mm(window->mm);
+	kthread_unuse_mm(window->mm);
 	put_task_struct(tsk);
 
 	/* Success */

From fdedb173a64af7687bfcf21fc899fea2f0685293 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 4 Jun 2020 11:45:12 +1000
Subject: [PATCH 024/241] drm/amdkfd: fix up for {un}use_mm() rename

Link: http://lkml.kernel.org/r/20200528201940.759c58ff@canb.auug.org.au
Signed-off-by: Stephen Rothwell
Cc: Alex Deucher
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a9a7f5aa2710a9..bdc58741b32e68 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -115,7 +115,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 		return;
 	}
 
-	use_mm(mm);
+	kthread_use_mm(mm);
 
 	dqm_lock(dqm);
 
@@ -142,7 +142,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 	}
 
 	dqm_unlock(dqm);
-	unuse_mm(mm);
+	kthread_unuse_mm(mm);
 	mmput(mm);
 }

From e37fb880cd21ada7cce48d7982ee33ced9bd7824 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 4 Jun 2020 11:45:13 +1000
Subject: [PATCH 025/241] kernel: set USER_DS in kthread_use_mm

Some architectures like arm64 and s390 require USER_DS to be set for kernel threads to access user address space, which is the whole purpose of kthread_use_mm, but others like x86 don't. That has led to a huge mess where some callers are fixed up once they are tested on said architectures, while others linger around, and yet others like io_uring try to do "clever" optimizations for what is usually just a trivial assignment to a member of the thread_struct for most architectures.

Make kthread_use_mm set USER_DS, and make kthread_unuse_mm restore the previous value instead.
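To illustrate the resulting cleanup (a sketch of the calling pattern only; mm, uptr, buf and len are placeholder names), a worker that previously needed:

	mm_segment_t oldfs = get_fs();

	set_fs(USER_DS);
	kthread_use_mm(mm);
	ret = copy_to_user(uptr, buf, len);
	kthread_unuse_mm(mm);
	set_fs(oldfs);

can now simply do:

	kthread_use_mm(mm);
	ret = copy_to_user(uptr, buf, len);
	kthread_unuse_mm(mm);

since the address-limit handling moves into the helpers themselves.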
Link: http://lkml.kernel.org/r/20200404094101.672954-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Michael S. Tsirkin Reviewed-by: Jens Axboe Tested-by: Jens Axboe Cc: Alex Deucher Cc: Al Viro Cc: Felipe Balbi Cc: Felix Kuehling Cc: Jason Wang Cc: Zhenyu Wang Cc: Zhi Wang Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/usb/gadget/function/f_fs.c | 4 ---- drivers/vhost/vhost.c | 3 --- fs/io-wq.c | 8 ++------ fs/io_uring.c | 4 ---- kernel/kthread.c | 6 ++++++ 5 files changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index f80b2747d7c57e..490d353d5fdecf 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -824,13 +824,9 @@ static void ffs_user_copy_worker(struct work_struct *work) bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD; if (io_data->read && ret > 0) { - mm_segment_t oldfs = get_fs(); - - set_fs(USER_DS); kthread_use_mm(io_data->mm); ret = ffs_copy_to_iter(io_data->buf, ret, &io_data->data); kthread_unuse_mm(io_data->mm); - set_fs(oldfs); } io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7c29493ff3f1e4..694f1c31c3ae85 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -336,9 +336,7 @@ static int vhost_worker(void *data) struct vhost_dev *dev = data; struct vhost_work *work, *work_next; struct llist_node *node; - mm_segment_t oldfs = get_fs(); - set_fs(USER_DS); kthread_use_mm(dev->mm); for (;;) { @@ -368,7 +366,6 @@ static int vhost_worker(void *data) } } kthread_unuse_mm(dev->mm); - set_fs(oldfs); return 0; } diff --git a/fs/io-wq.c b/fs/io-wq.c index 748621f7391ec6..a5e90ac39e4d3e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -169,7 +169,6 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) dropped_lock = true; } __set_current_state(TASK_RUNNING); - set_fs(KERNEL_DS); kthread_unuse_mm(worker->mm); mmput(worker->mm); worker->mm = NULL; @@ -421,14 +420,11 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) mmput(worker->mm); worker->mm = NULL; } - if (!work->mm) { - set_fs(KERNEL_DS); + if (!work->mm) return; - } + if (mmget_not_zero(work->mm)) { kthread_use_mm(work->mm); - if (!worker->mm) - set_fs(USER_DS); worker->mm = work->mm; /* hang on to this mm */ work->mm = NULL; diff --git a/fs/io_uring.c b/fs/io_uring.c index 1c0697433f28ed..46a06641704daa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6127,15 +6127,12 @@ static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; const struct cred *old_cred; - mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned long timeout; int ret = 0; complete(&ctx->sq_thread_comp); - old_fs = get_fs(); - set_fs(USER_DS); old_cred = override_creds(ctx->creds); timeout = jiffies + ctx->sq_thread_idle; @@ -6240,7 +6237,6 @@ static int io_sq_thread(void *data) if (current->task_works) task_work_run(); - set_fs(old_fs); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); diff --git a/kernel/kthread.c b/kernel/kthread.c index f4373cca41b011..8e3d2d7fdf5e29 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,6 +53,7 @@ struct kthread { unsigned int cpu; int (*threadfn)(void *); void *data; + mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -1252,6 +1253,9 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); + + 
to_kthread(tsk)->oldfs = get_fs(); + set_fs(USER_DS); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1266,6 +1270,8 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); + set_fs(to_kthread(tsk)->oldfs); + task_lock(tsk); sync_mm_rss(mm); tsk->mm = NULL; From ee37676469319ef038d6ccf97e8e8a8675d0b708 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:13 +1000 Subject: [PATCH 026/241] mm/kmemleak: silence KCSAN splats in checksum Even if KCSAN is disabled for kmemleak, update_checksum() could still call crc32() (which is outside of kmemleak.c) to dereference object->pointer. Thus, the value of object->pointer could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in crc32_le_base / do_raw_spin_lock write to 0xffffb0ea683a7d50 of 4 bytes by task 23575 on cpu 12: do_raw_spin_lock+0x114/0x200 debug_spin_lock_after at kernel/locking/spinlock_debug.c:91 (inlined by) do_raw_spin_lock at kernel/locking/spinlock_debug.c:115 _raw_spin_lock+0x40/0x50 __handle_mm_fault+0xa9e/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffffb0ea683a7d50 of 4 bytes by task 839 on cpu 60: crc32_le_base+0x67/0x350 crc32_le_base+0x67/0x350: crc32_body at lib/crc32.c:106 (inlined by) crc32_le_generic at lib/crc32.c:179 (inlined by) crc32_le at lib/crc32.c:197 kmemleak_scan+0x528/0xd90 update_checksum at mm/kmemleak.c:1172 (inlined by) kmemleak_scan at mm/kmemleak.c:1497 kmemleak_scan_thread+0xcc/0xfa kthread+0x1e0/0x200 ret_from_fork+0x27/0x50 If a shattered value was returned due to a data race, it will be corrected in the next scan. Thus, let KCSAN ignore all reads in the region to silence KCSAN in case the write side is non-atomic. Link: http://lkml.kernel.org/r/20200317182754.2180-1-cai@lca.pw Signed-off-by: Qian Cai Suggested-by: Marco Elver Acked-by: Marco Elver Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/kmemleak.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e362dc3d2028c3..5e252d91eb14b1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1169,8 +1169,10 @@ static bool update_checksum(struct kmemleak_object *object) u32 old_csum = object->checksum; kasan_disable_current(); + kcsan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); kasan_enable_current(); + kcsan_enable_current(); return object->checksum != old_csum; } From d78e39fee7dc21fcb71ec4bf4d155a1e40a62494 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:14 +1000 Subject: [PATCH 027/241] kallsyms/printk: add loglvl to print_ip_sym() Patch series "Add log level to show_stack()", v3. Add log level argument to show_stack(). Done in three stages: 1. Introducing show_stack_loglvl() for every architecture 2. Migrating old users with an explicit log level 3. Renaming show_stack_loglvl() into show_stack() Justification: - It's a design mistake to move a business-logic decision into platform realization detail. - I have currently two patches sets that would benefit from this work: Removing console_loglevel jumps in sysrq driver [1] Hung task warning before panic [2] - suggested by Tetsuo (but he probably didn't realise what it would involve). - While doing (1), (2) the backtraces were adjusted to headers and other messages for each situation - so there won't be a situation when the backtrace is printed, but the headers are missing because they have lesser log level (or the reverse). 
- As a result, the console_loglevel games for kdb in (2) are removed.

Least important for upstream, but maybe still worth noting: every company I've worked at so far has had an off-list patch to print a backtrace with the needed log level (but only for the architecture they cared about).

If you have other ideas for how you would benefit from show_stack() with a log level - please reply to this cover letter.

See also the discussion on v1: https://lore.kernel.org/linux-riscv/20191106083538.z5nlpuf64cigxigh@pathway.suse.cz/

This patch (of 50):

print_ip_sym() needs a log level parameter to match the rest of what is being printed. Otherwise, half of the expected backtrace would be printed and the other half may be missing at some logging levels.

The following callees now use the adjusted log level:

- microblaze/unwind: the same level as headers & userspace unwind. Note that the pr_debug()'s there are for debugging the unwinder itself.

- nds32/traps: symbol addresses are printed with the same log level as the backtrace headers.

- lockdep: the ip for locking issues is printed with the same log level as the rest of the warning.

- sched: the ip where preemption was disabled is printed as an error, like the rest of the message.

- ftrace: bug reports are now consistent in the log level being used.

Link: http://lkml.kernel.org/r/20200418201944.482088-2-dima@arista.com
Signed-off-by: Dmitry Safonov
Acked-by: Steven Rostedt (VMware)
Cc: Albert Ou
Cc: Ben Segall
Cc: Dietmar Eggemann
Cc: Greentime Hu
Cc: Greg Kroah-Hartman
Cc: Ingo Molnar
Cc: James Hogan
Cc: Juri Lelli
Cc: Mel Gorman
Cc: Michal Simek
Cc: Palmer Dabbelt
Cc: Paul Burton
Cc: Paul Walmsley
Cc: Peter Zijlstra
Cc: Ralf Baechle
Cc: Thomas Gleixner
Cc: Vincent Chen
Cc: Vincent Guittot
Cc: Will Deacon
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Dmitry Safonov
Cc: Jiri Slaby
Cc: Petr Mladek
Cc: Sergey Senozhatsky
Cc: Ivan Kokshaysky
Cc: Matt Turner
Cc: Richard Henderson
Cc: Vineet Gupta
Cc: Russell King
Cc: Catalin Marinas
Cc: Aurelien Jacquiot
Cc: Mark Salter
Cc: Guo Ren
Cc: Yoshinori Sato
Cc: Brian Cain
Cc: Fenghua Yu
Cc: Tony Luck
Cc: Geert Uytterhoeven
Cc: Ley Foon Tan
Cc: Jonas Bonn
Cc: Stafford Horne
Cc: Stefan Kristiansson
Cc: Helge Deller
Cc: "James E.J. Bottomley"
Cc: Benjamin Herrenschmidt
Cc: Michael Ellerman
Cc: Paul Mackerras
Cc: Christian Borntraeger
Cc: Heiko Carstens
Cc: Vasily Gorbik
Cc: Rich Felker
Cc: "David S. Miller"
Cc: Anton Ivanov
Cc: Jeff Dike
Cc: Richard Weinberger
Cc: Guan Xuetao
Cc: Borislav Petkov
Cc: "H. Peter Anvin"
Cc: Chris Zankel
Cc: Max Filippov
Cc: Len Brown
Cc: Pavel Machek
Cc: "Rafael J. Wysocki"
Cc: "Rafael J.
Wysocki" Cc: Daniel Thompson Cc: Douglas Anderson Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/kernel/unwind.c | 2 +- arch/mips/kernel/traps.c | 4 ++-- arch/nds32/kernel/traps.c | 4 ++-- arch/riscv/kernel/stacktrace.c | 2 +- include/linux/kallsyms.h | 4 ++-- kernel/locking/lockdep.c | 4 ++-- kernel/sched/core.c | 6 ++---- kernel/trace/ftrace.c | 8 ++++---- tools/include/linux/kallsyms.h | 2 +- 9 files changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 34c270cb11fcbd..4241cdd28ee779 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -254,7 +254,7 @@ static void microblaze_unwind_inner(struct task_struct *task, task->comm); break; } else - print_ip_sym(pc); + print_ip_sym(KERN_INFO, pc); } /* Stop when we reach anything not part of the kernel */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 22f805a73921d6..210fea63de7501 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -125,7 +125,7 @@ static void show_raw_backtrace(unsigned long reg29) break; } if (__kernel_text_address(addr)) - print_ip_sym(addr); + print_ip_sym(KERN_DEFAULT, addr); } printk("\n"); } @@ -155,7 +155,7 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) } printk("Call Trace:\n"); do { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); pc = unwind_stack(task, &sp, pc, &ra); } while (pc); pr_cont("\n"); diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index f4d386b5262276..40625760a125e1 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -108,7 +108,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) if (__kernel_text_address(ret_addr)) { ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; @@ -124,7 +124,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 837b9b38f825cf..9f1ac258482fd7 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -99,7 +99,7 @@ void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); return false; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 657a83b943f064..98338dc6b5d275 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -165,9 +165,9 @@ static inline int kallsyms_show_value(void) #endif /*CONFIG_KALLSYMS*/ -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { - printk("[<%px>] %pS\n", (void *) ip, (void *) ip); + printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7899ce4225c037..29a8de4c50b90e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, 
ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce8e..c06da3c3e317d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d3e3293ee53e2..c163c3531fafc4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2021,11 +2021,11 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) switch (failed) { case -EFAULT: pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2035,11 +2035,11 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) break; case -EPERM: pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { diff --git a/tools/include/linux/kallsyms.h b/tools/include/linux/kallsyms.h index 89ca6fe257ccb9..efb6c3f5f2a9a5 100644 --- a/tools/include/linux/kallsyms.h +++ b/tools/include/linux/kallsyms.h @@ -20,7 +20,7 @@ static inline const char *kallsyms_lookup(unsigned long addr, #include #include -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { char **name; From eaa6f78c2b80cc41c0091d7558cdf3ed416481d2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:15 +1000 Subject: [PATCH 028/241] alpha: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. 
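For reference, the kind of trick this refers to looks roughly like the following (a hypothetical sketch of the sysrq-style pattern, not code from this series):

	int saved = console_loglevel;

	/* racy: also unsuppresses unrelated messages from other CPUs */
	console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
	show_stack(NULL, NULL);
	console_loglevel = saved;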
Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-3-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/kernel/traps.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index f6b9664ac5042d..2402f1777f54ee 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -121,10 +121,10 @@ dik_show_code(unsigned int *pc) } static void -dik_show_trace(unsigned long *sp) +dik_show_trace(unsigned long *sp, const char *loglvl) { long i = 0; - printk("Trace:\n"); + printk("%sTrace:\n", loglvl); while (0x1ff8 & (unsigned long) sp) { extern char _stext[], _etext[]; unsigned long tmp = *sp; @@ -133,24 +133,25 @@ dik_show_trace(unsigned long *sp) continue; if (tmp >= (unsigned long) &_etext) continue; - printk("[<%lx>] %pSR\n", tmp, (void *)tmp); + printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp); if (i > 40) { - printk(" ..."); + printk("%s ...", loglvl); break; } } - printk("\n"); + printk("%s\n", loglvl); } static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { unsigned long *stack; int i; /* - * debugging aid: "show_stack(NULL);" prints the + * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the * back trace for this cpu. */ if(sp==NULL) @@ -163,14 +164,19 @@ void show_stack(struct task_struct *task, unsigned long *sp) if ((i % 4) == 0) { if (i) pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } else { pr_cont(" "); } pr_cont("%016lx", *stack++); } pr_cont("\n"); - dik_show_trace(sp); + dik_show_trace(sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } void @@ -184,7 +190,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs, r9_15); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); dik_show_code((unsigned int *)regs->pc); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { @@ -625,7 +631,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg, printk("gp = %016lx sp = %p\n", regs->gp, regs+1); dik_show_code((unsigned int *)pc); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { printk("die_if_kernel recursion detected.\n"); From 7982fe6b77f59e2ed560b73e3c8a75e3cdef945a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:16 +1000 Subject: [PATCH 029/241] arc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. 
In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a good side-effect header "Stack Trace:" is now printed with the same log level as the rest of backtrace. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-4-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arc/include/asm/bug.h | 3 ++- arch/arc/kernel/stacktrace.c | 21 +++++++++++++++------ arch/arc/kernel/troubleshoot.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 0be19fd1a4126a..4c453ba96c5195 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -13,7 +13,8 @@ struct task_struct; void show_regs(struct pt_regs *regs); -void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs); +void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl); void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 1e440bbfa876ba..24f9cd8a12c94f 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -158,9 +158,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, /* Call-back which plugs into unwinding core to dump the stack in * case of panic/OOPs/BUG etc */ -static int __print_sym(unsigned int address, void *unused) +static int __print_sym(unsigned int address, void *arg) { - printk(" %pS\n", (void *)address); + const char *loglvl = arg; + + printk("%s %pS\n", loglvl, (void *)address); return 0; } @@ -217,17 +219,24 @@ static int __get_first_nonsched(unsigned int address, void *unused) *------------------------------------------------------------------------- */ -noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs) +noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl) { - pr_info("\nStack Trace:\n"); - arc_unwind_core(tsk, regs, __print_sym, NULL); + printk("%s\nStack Trace:\n", loglvl); + arc_unwind_core(tsk, regs, __print_sym, (void *)loglvl); } EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) +{ + show_stacktrace(tsk, NULL, loglvl); +} + void show_stack(struct task_struct *tsk, unsigned long *sp) { - show_stacktrace(tsk, NULL); + show_stack_loglvl(tsk, sp, KERN_DEFAULT); } /* Another API expected by schedular, shows up in "ps" as Wait Channel diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 3393558876a9b1..3044c8347b746e 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -240,5 +240,5 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, /* Show stack trace if this Fatality happened in kernel mode */ if 
(!user_mode(regs)) - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); } From d419f6902986813bbab3edcd17e95067ee6261e0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:17 +1000 Subject: [PATCH 030/241] arm/asm: add loglvl to c_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to c_backtrace() as a preparation for introducing show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-5-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/include/asm/bug.h | 3 ++- arch/arm/include/asm/traps.h | 3 ++- arch/arm/kernel/traps.c | 9 +++++---- arch/arm/kernel/unwind.c | 2 +- arch/arm/lib/backtrace-clang.S | 9 +++++++-- arch/arm/lib/backtrace.S | 14 ++++++++++---- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index deef4d0cb3b500..673c7dd75ab90c 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -82,7 +82,8 @@ void hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, int code, const char *name); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp, int pmode, + const char *loglvl); struct mm_struct; void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index 172b08ff3760d4..987fefb0a4dbc4 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -29,7 +29,8 @@ static inline int __in_irqentry_text(unsigned long ptr) } extern void __init early_trap_init(void *); -extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame); +extern void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl); extern void ptrace_break(struct pt_regs *regs); extern void *vectors_page; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0ff2..2030611f22b8eb 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -62,7 +62,8 @@ __setup("user_debug=", user_debug_setup); static void dump_mem(const char *, const char *, unsigned long, unsigned long); -void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame) +void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl) { unsigned long end = frame + 4 + sizeof(struct pt_regs); @@ -76,7 +77,7 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long dump_mem("", "Exception stack", 
frame + 4, end); } -void dump_backtrace_stm(u32 *stack, u32 instruction) +void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) { char str[80], *p; unsigned int x; @@ -238,7 +239,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp, mode, NULL); } #endif @@ -666,7 +667,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) dump_instr("", regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs)); + c_backtrace(frame_pointer(regs), processor_mode(regs), NULL); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 11a964fd66f474..343cc27b36c4c1 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -493,7 +493,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, NULL); } } diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 2ff375144b55b0..6174c45f53a5db 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -17,6 +17,7 @@ #define sv_pc r6 #define mask r7 #define sv_lr r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -99,6 +100,7 @@ ENDPROC(c_backtrace) @ to ensure 8 byte alignment movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? moveq mask, #0xfc000003 movne mask, #0 @ mask for 32-bit @@ -167,6 +169,7 @@ finished_setup: mov r1, sv_lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry /* @@ -183,6 +186,7 @@ finished_setup: ldr r0, [frame] @ locals are stored in @ the preceding frame subeq r0, r0, #4 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers /* @@ -196,7 +200,8 @@ finished_setup: bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) @@ -209,7 +214,7 @@ ENDPROC(c_backtrace) .long 1005b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Lopcode: .word 0xe92d4800 >> 11 @ stmfd sp!, {... fp, lr} .word 0x0b000000 @ bl if these bits are set diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 582925238d65ea..872f658638d997 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -18,6 +18,7 @@ #define sv_pc r6 #define mask r7 #define offset r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -25,9 +26,10 @@ ENTRY(c_backtrace) ret lr ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... + stmfd sp!, {r4 - r9, lr} @ Save an extra register so we have a location... movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? 
ARM( moveq mask, #0xfc000003 ) @@ -73,6 +75,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions ldr r1, [frame, #-4] @ get saved lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, @@ -80,12 +83,14 @@ for_each_frame: tst frame, mask @ Check for address exceptions teq r3, r1, lsr #11 ldreq r0, [frame, #-8] @ get sp subeq r0, r0, #4 @ point at the last arg + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers 1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} ldr r3, .Ldsi @ instruction exists, teq r3, r1, lsr #11 subeq r0, frame, #16 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers teq sv_fp, #0 @ zero saved fp means @@ -96,9 +101,10 @@ for_each_frame: tst frame, mask @ Check for address exceptions bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk -no_frame: ldmfd sp!, {r4 - r8, pc} +no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -109,7 +115,7 @@ ENDPROC(c_backtrace) .long 1004b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... fp, ip, lr, pc} .word 0xe92d0000 >> 11 @ stmfd sp!, {} From 415ca2fe610c12e52fe658044b4b6fb964a4bbce Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:18 +1000 Subject: [PATCH 031/241] arm: add loglvl to unwind_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to unwind_backtrace() as a preparation for introducing show_stack_loglvl(). As a good side-effect arm_syscall() is now printing errors with the same log level as the backtrace. 
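To see what this replaces, the pattern being removed looks roughly like the following (a simplified sketch of how callers such as sysrq steer output today by saving and restoring console_loglevel, not code quoted from any one file):

    /* old: racy, affects every CPU's printk, and a deferred printk
     * can still miss the temporarily raised level */
    int orig = console_loglevel;
    console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
    show_stack(task, NULL);
    console_loglevel = orig;

    /* with this series the caller simply states the level it wants */
    show_stack_loglvl(task, NULL, KERN_EMERG);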
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-6-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/include/asm/unwind.h | 3 ++- arch/arm/kernel/traps.c | 6 +++--- arch/arm/kernel/unwind.c | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 6e282c33126b6f..0f8a3439902d06 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -36,7 +36,8 @@ extern struct unwind_table *unwind_table_add(unsigned long start, unsigned long text_addr, unsigned long text_size); extern void unwind_table_del(struct unwind_table *tab); -extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2030611f22b8eb..685e17c2e2756d 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -204,7 +204,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) #ifdef CONFIG_ARM_UNWIND static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { - unwind_backtrace(regs, tsk); + unwind_backtrace(regs, tsk, KERN_DEFAULT); } #else static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) @@ -664,10 +664,10 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) if (user_debug & UDBG_SYSCALL) { pr_err("[%d] %s: arm syscall %d\n", task_pid_nr(current), current->comm, no); - dump_instr("", regs); + dump_instr(KERN_ERR, regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs), NULL); + c_backtrace(frame_pointer(regs), processor_mode(regs), KERN_ERR); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 343cc27b36c4c1..d2bd0df2318d62 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -455,7 +455,8 @@ int unwind_frame(struct stackframe *frame) return URC_OK; } -void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; @@ -493,7 +494,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4, NULL); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, loglvl); } } From 8d5430fb43870f91bc9675646bace813043384e8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:18 +1000 Subject: [PATCH 032/241] arm: add loglvl to dump_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. 
Also, it will consolidate printings with headers. Add log level argument to dump_backtrace() as a preparation for introducing show_stack_loglvl(). As a good side-effect __die() now prints not only "Stack:" header with KERN_EMERG, but the backtrace itself. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-7-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/kernel/traps.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 685e17c2e2756d..0f09ace18e6cae 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -202,17 +202,19 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } #ifdef CONFIG_ARM_UNWIND -static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - unwind_backtrace(regs, tsk, KERN_DEFAULT); + unwind_backtrace(regs, tsk, loglvl); } #else -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp, mode; int ok = 1; - printk("Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -239,13 +241,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode, NULL); + c_backtrace(fp, mode, loglvl); } #endif void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, KERN_DEFAULT); barrier(); } @@ -289,7 +291,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } From 80698a322e5c20253d684201c39ccfca6a883cde Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:19 +1000 Subject: [PATCH 033/241] arm: wire up dump_backtrace_{entry,stm} Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Now that c_backtrace() always emits correct loglvl, use it for printing. 
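The "%s" forwarding works because the KERN_* macros are plain string literals (an ASCII SOH byte plus a level character, e.g. KERN_ERR is "\001" "3"), and printk() extracts the level prefix from the message text after formatting. A level received as const char * and printed through a leading "%s" therefore behaves exactly like a literal KERN_* prefix. A minimal sketch (dump_entry is illustrative, not a function in this patch):

    /* equivalent to printk(KERN_ERR "[<...>] ...") when loglvl == KERN_ERR */
    static void dump_entry(unsigned long where, const char *loglvl)
    {
        printk("%s[<%08lx>] %pS\n", loglvl, where, (void *)where);
    }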
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-8-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/kernel/traps.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 0f09ace18e6cae..e1be6c85327c26 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -68,13 +68,15 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", where, (void *)where, from, (void *)from); + printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", + loglvl, where, (void *)where, from, (void *)from); #else - printk("Function entered at [<%08lx>] from [<%08lx>]\n", where, from); + printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", + loglvl, where, from); #endif if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem("", "Exception stack", frame + 4, end); + dump_mem(loglvl, "Exception stack", frame + 4, end); } void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) @@ -89,12 +91,12 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) if (++x == 6) { x = 0; p = str; - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } } } if (p != str) - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } #ifndef CONFIG_ARM_UNWIND From a4e92486552770d1e7f93fad9259f111b58f36c2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:20 +1000 Subject: [PATCH 034/241] arm: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
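The per-architecture shape is the same throughout the series; for arm it amounts to (matching the diff below):

    void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp,
                           const char *loglvl)
    {
        dump_backtrace(NULL, tsk, loglvl);
        barrier();
    }

    /* kept as a thin compatibility wrapper until all callers are converted */
    void show_stack(struct task_struct *tsk, unsigned long *sp)
    {
        show_stack_loglvl(tsk, sp, KERN_DEFAULT);
    }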
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-9-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index e1be6c85327c26..00455b5bbf8aeb 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -247,12 +247,18 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, } #endif -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { - dump_backtrace(NULL, tsk, KERN_DEFAULT); + dump_backtrace(NULL, tsk, loglvl); barrier(); } +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + show_stack_loglvl(tsk, sp, KERN_DEFAULT); +} + #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) From f8eeb118ca1862a2f50b41ac2e2d2aa1b35218c4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:21 +1000 Subject: [PATCH 035/241] arm64: add loglvl to dump_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to dump_backtrace() as a preparation for introducing show_stack_loglvl(). 
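Note that the frame printing itself keeps the %pS pointer format, which resolves an address to symbol+offset through kallsyms; only the level prefix becomes caller-controlled. The converted arm64 helper (see the diff below) boils down to:

    static void dump_backtrace_entry(unsigned long where, const char *loglvl)
    {
        /* %pS resolves the address to symbol+offset via kallsyms */
        printk("%s %pS\n", loglvl, (void *)where);
    }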
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-10-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/stacktrace.h | 3 ++- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/traps.c | 15 ++++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 5017b531a41531..fc7613023c1920 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -64,7 +64,8 @@ struct stackframe { extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); -extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index eade7807e819d5..6089638c7d43f4 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -306,7 +306,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs * regs) { __show_regs(regs); - dump_backtrace(regs, NULL); + dump_backtrace(regs, NULL, KERN_DEFAULT); } static void tls_thread_flush(void) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index d332590f59782c..f602aca64baaf8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -53,9 +53,9 @@ static const char *handler[]= { int show_unhandled_signals = 0; -static void dump_backtrace_entry(unsigned long where) +static void dump_backtrace_entry(unsigned long where, const char *loglvl) { - printk(" %pS\n", (void *)where); + printk("%s %pS\n", loglvl, (void *)where); } static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) @@ -83,7 +83,8 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; int skip = 0; @@ -115,11 +116,11 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) thread_saved_pc(tsk)); } - printk("Call trace:\n"); + printk("%sCall trace:\n", loglvl); do { /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(frame.pc); + dump_backtrace_entry(frame.pc, loglvl); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -129,7 +130,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) * at which an exception has taken place, use regs->pc * instead. 
*/ - dump_backtrace_entry(regs->pc); + dump_backtrace_entry(regs->pc, loglvl); } } while (!unwind_frame(tsk, &frame)); @@ -138,7 +139,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, KERN_DEFAULT); barrier(); } From c0ba0e57748bc7b3f729e20479ff4978dacf5be1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:22 +1000 Subject: [PATCH 036/241] arm64: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-11-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f602aca64baaf8..3621868b2fcc81 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -137,12 +137,18 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, put_task_stack(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { - dump_backtrace(NULL, tsk, KERN_DEFAULT); + dump_backtrace(NULL, tsk, loglvl); barrier(); } +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + show_stack_loglvl(tsk, sp, KERN_DEFAULT); +} + #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) From c7413db0a619f5a2b0d4e790a05896f79e0867d4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:23 +1000 Subject: [PATCH 037/241] c6x: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
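On c6x the conversion also retires pr_debug() here: pr_debug() emits nothing unless DEBUG (or dynamic debug for that file) is enabled, so the trace body could silently disappear from oops output. The change below turns it into an ordinary printk() at the caller's level, schematically:

    /* before: emitted only when DEBUG/dynamic debug is enabled */
    pr_debug("Call trace:");

    /* after: always emitted, at whatever level the caller passed */
    printk("%sCall trace:", loglvl);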
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-12-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Aurelien Jacquiot Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/c6x/kernel/traps.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index ec61034fdf56cc..4afbf48f1ce004 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -344,12 +344,13 @@ asmlinkage int process_exception(struct pt_regs *regs) static int kstack_depth_to_print = 48; -static void show_trace(unsigned long *stack, unsigned long *endstack) +static void show_trace(unsigned long *stack, unsigned long *endstack, + const char *loglvl) { unsigned long addr; int i; - pr_debug("Call trace:"); + printk("%sCall trace:", loglvl); i = 0; while (stack + 1 <= endstack) { addr = *stack++; @@ -364,16 +365,17 @@ static void show_trace(unsigned long *stack, unsigned long *endstack) if (__kernel_text_address(addr)) { #ifndef CONFIG_KALLSYMS if (i % 5 == 0) - pr_debug("\n "); + printk("%s\n ", loglvl); #endif - pr_debug(" [<%08lx>] %pS\n", addr, (void *)addr); + printk("%s [<%08lx>] %pS\n", loglvl, addr, (void *)addr); i++; } } - pr_debug("\n"); + printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p, *endstack; int i; @@ -398,7 +400,12 @@ void show_stack(struct task_struct *task, unsigned long *stack) pr_cont(" %08lx", *p++); } pr_cont("\n"); - show_trace(stack, endstack); + show_trace(stack, endstack, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_DEBUG); } int is_valid_bugaddr(unsigned long addr) From 81214f431f9a272dd26d690a499bbc6702f1df9b Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:23 +1000 Subject: [PATCH 038/241] csky: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
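walk_stackframe() only offers an opaque void *arg for its callback, so the level string travels through it and is cast back on the other side; the (void *) cast merely drops const for the trip through the API, and the callback never writes through it. From the csky conversion below:

    static bool print_trace_address(unsigned long pc, void *arg)
    {
        print_ip_sym((const char *)arg, pc);  /* recover the level string */
        return false;
    }

    walk_stackframe(task, NULL, print_trace_address, (void *)loglvl);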
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-13-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Guo Ren Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/csky/kernel/stacktrace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index 92809e1da723c8..ca135f13cc1383 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -91,14 +91,21 @@ static void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(pc); + print_ip_sym((const char *)arg, pc); return false; } +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) +{ + pr_cont("Call Trace:\n"); + walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); +} + void show_stack(struct task_struct *task, unsigned long *sp) { pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, NULL); + walk_stackframe(task, NULL, print_trace_address, KERN_INFO); } static bool save_wchan(unsigned long pc, void *arg) From 1f03f80fdc7dff216a1f3a9f9f143e470d1fb23a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:24 +1000 Subject: [PATCH 039/241] h8300: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
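For stack dumps built up piecewise, only the first fragment of each output line carries the level; the remainder is appended with pr_cont(), which continues the current record. The h8300 row loop in the diff below reduces to:

    /* open each row at the caller's level, then continue it in place */
    if (i % 8 == 0)
        printk("%s ", loglvl);
    pr_cont(" %08lx", *stack++);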
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-14-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/h8300/kernel/traps.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index e47a9e0dc278fa..6362446563d6ab 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -115,7 +115,8 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack_loglvl(struct task_struct *task, unsigned long *esp, + const char *loglvl) { unsigned long *stack, addr; int i; @@ -125,17 +126,17 @@ void show_stack(struct task_struct *task, unsigned long *esp) stack = esp; - pr_info("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); for (i = 0; i < kstack_depth_to_print; i++) { if (((unsigned long)stack & (THREAD_SIZE - 1)) >= THREAD_SIZE-4) break; if (i % 8 == 0) - pr_info(" "); + printk("%s ", loglvl); pr_cont(" %08lx", *stack++); } - pr_info("\nCall Trace:\n"); + printk("%s\nCall Trace:\n", loglvl); i = 0; stack = esp; while (((unsigned long)stack & (THREAD_SIZE - 1)) < THREAD_SIZE-4) { @@ -150,10 +151,15 @@ void show_stack(struct task_struct *task, unsigned long *esp) */ if (check_kernel_text(addr)) { if (i % 4 == 0) - pr_info(" "); + printk("%s ", loglvl); pr_cont(" [<%08lx>]", addr); i++; } } - pr_info("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *esp) +{ + show_stack_loglvl(task, esp, KERN_INFO); } From 816414b009e2390b438515f7606165ca4cff3cf2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:25 +1000 Subject: [PATCH 040/241] hexagon: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a good side-effect die() now prints the stacktrace with KERN_EMERG aligned with other messages. 
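With the level supplied by the caller, one dumper now serves both the routine path and the panic path; after this patch hexagon's two entry points pick different levels (taken from the diff below):

    show_stack_loglvl(task, fp, KERN_INFO);        /* default wrapper */
    do_show_stack(current, &regs->r30, pt_elr(regs), KERN_EMERG); /* die() */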
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-15-dima@arista.com Signed-off-by: Dmitry Safonov Acked-by: Brian Cain Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/hexagon/kernel/traps.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 69c623b14ddd28..a8a3a210d7810c 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -79,7 +79,7 @@ static const char *ex_name(int ex) } static void do_show_stack(struct task_struct *task, unsigned long *fp, - unsigned long ip) + unsigned long ip, const char *loglvl) { int kstack_depth_to_print = 24; unsigned long offset, size; @@ -93,9 +93,8 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, if (task == NULL) task = current; - printk(KERN_INFO "CPU#%d, %s/%d, Call Trace:\n", - raw_smp_processor_id(), task->comm, - task_pid_nr(task)); + printk("%sCPU#%d, %s/%d, Call Trace:\n", loglvl, raw_smp_processor_id(), + task->comm, task_pid_nr(task)); if (fp == NULL) { if (task == current) { @@ -108,7 +107,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } if ((((unsigned long) fp) & 0x3) || ((unsigned long) fp < 0x1000)) { - printk(KERN_INFO "-- Corrupt frame pointer %p\n", fp); + printk("%s-- Corrupt frame pointer %p\n", loglvl, fp); return; } @@ -125,8 +124,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, name = kallsyms_lookup(ip, &size, &offset, &modname, tmpstr); - printk(KERN_INFO "[%p] 0x%lx: %s + 0x%lx", fp, ip, name, - offset); + printk("%s[%p] 0x%lx: %s + 0x%lx", loglvl, fp, ip, name, offset); if (((unsigned long) fp < low) || (high < (unsigned long) fp)) printk(KERN_CONT " (FP out of bounds!)"); if (modname) @@ -136,8 +134,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, newfp = (unsigned long *) *fp; if (((unsigned long) newfp) & 0x3) { - printk(KERN_INFO "-- Corrupt frame pointer %p\n", - newfp); + printk("%s-- Corrupt frame pointer %p\n", loglvl, newfp); break; } @@ -147,7 +144,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, + 8); if (regs->syscall_nr != -1) { - printk(KERN_INFO "-- trap0 -- syscall_nr: %ld", + printk("%s-- trap0 -- syscall_nr: %ld", loglvl, regs->syscall_nr); printk(KERN_CONT " psp: %lx elr: %lx\n", pt_psp(regs), pt_elr(regs)); @@ -155,7 +152,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } else { /* really want to see more ... 
*/ kstack_depth_to_print += 6; printk("%s-- %s (0x%lx) badva: %lx\n", loglvl, ex_name(pt_cause(regs)), pt_cause(regs), pt_badva(regs)); } @@ -178,10 +175,16 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } } -void show_stack(struct task_struct *task, unsigned long *fp) +void show_stack_loglvl(struct task_struct *task, unsigned long *fp, + const char *loglvl) { /* Saved link reg is one word above FP */ - do_show_stack(task, fp, 0); + do_show_stack(task, fp, 0, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *fp) +{ + show_stack_loglvl(task, fp, KERN_INFO); +} int die(const char *str, struct pt_regs *regs, long err) @@ -207,7 +210,7 @@ int die(const char *str, struct pt_regs *regs, long err) print_modules(); show_regs(regs); - do_show_stack(current, &regs->r30, pt_elr(regs)); + do_show_stack(current, &regs->r30, pt_elr(regs), KERN_EMERG); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); From 32f3c94a5d92d7d1d4157cb98e98c01d0a318345 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:25 +1000 Subject: [PATCH 041/241] ia64: pass log level as arg into ia64_do_show_stack() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to ia64_do_show_stack() as a preparation to introduce show_stack_loglvl(). Also, make ia64_do_show_stack() static as it's not used outside.
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-16-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Fenghua Yu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/include/asm/ptrace.h | 1 - arch/ia64/kernel/process.c | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 7ff574d56429c2..b3aa4609010129 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -114,7 +114,6 @@ static inline long regs_return_value(struct pt_regs *regs) struct task_struct; /* forward decl */ struct unw_frame_info; /* forward decl */ - extern void ia64_do_show_stack (struct unw_frame_info *, void *); extern unsigned long ia64_get_user_rbs_end (struct task_struct *, struct pt_regs *, unsigned long *); extern long ia64_peek (struct task_struct *, struct switch_stack *, unsigned long, diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 10cb9382ab76c4..332c6dfe733389 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -64,12 +64,13 @@ EXPORT_SYMBOL(boot_option_idle_override); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); -void +static void ia64_do_show_stack (struct unw_frame_info *info, void *arg) { unsigned long ip, sp, bsp; + const char *loglvl = arg; - printk("\nCall Trace:\n"); + printk("%s\nCall Trace:\n", loglvl); do { unw_get_ip(info, &ip); if (ip == 0) @@ -77,9 +78,9 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) unw_get_sp(info, &sp); unw_get_bsp(info, &bsp); - printk(" [<%016lx>] %pS\n" + printk("%s [<%016lx>] %pS\n" " sp=%016lx bsp=%016lx\n", - ip, (void *)ip, sp, bsp); + loglvl, ip, (void *)ip, sp, bsp); } while (unw_unwind(info) >= 0); } @@ -87,12 +88,12 @@ void show_stack (struct task_struct *task, unsigned long *sp) { if (!task) - unw_init_running(ia64_do_show_stack, NULL); + unw_init_running(ia64_do_show_stack, (void *)KERN_DEFAULT); else { struct unw_frame_info info; unw_init_from_blocked_task(&info, task); - ia64_do_show_stack(&info, NULL); + ia64_do_show_stack(&info, (void *)KERN_DEFAULT); } } From 1203559e305ee754371e3bfa240e363ba4768104 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:26 +1000 Subject: [PATCH 042/241] ia64: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-17-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Fenghua Yu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/kernel/process.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 332c6dfe733389..913d9a01cbf9b3 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -85,18 +85,25 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) } void -show_stack (struct task_struct *task, unsigned long *sp) +show_stack_loglvl (struct task_struct *task, unsigned long *sp, + const char *loglvl) { if (!task) - unw_init_running(ia64_do_show_stack, (void *)KERN_DEFAULT); + unw_init_running(ia64_do_show_stack, (void *)loglvl); else { struct unw_frame_info info; unw_init_from_blocked_task(&info, task); - ia64_do_show_stack(&info, (void *)KERN_DEFAULT); + ia64_do_show_stack(&info, (void *)loglvl); } } +void +show_stack (struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); +} + void show_regs (struct pt_regs *regs) { From e8d6e4a07a7c3ea719fdd7d2525826c14569b3e6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:27 +1000 Subject: [PATCH 043/241] m68k: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-18-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/m68k/kernel/traps.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 344f93d36a9a08..ffcc5ec4fac3b4 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -811,13 +811,13 @@ asmlinkage void buserr_c(struct frame *fp) static int kstack_depth_to_print = 48; -void show_trace(unsigned long *stack) +static void show_trace(unsigned long *stack, const char *loglvl) { unsigned long *endstack; unsigned long addr; int i; - pr_info("Call Trace:"); + printk("%sCall Trace:", loglvl); addr = (unsigned long)stack + THREAD_SIZE - 1; endstack = (unsigned long *)(addr & -THREAD_SIZE); i = 0; @@ -935,7 +935,8 @@ void show_registers(struct pt_regs *regs) pr_cont("\n"); } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p; unsigned long *endstack; @@ -949,7 +950,7 @@ void show_stack(struct task_struct *task, unsigned long *stack) } endstack = (unsigned long *)(((unsigned long)stack + THREAD_SIZE - 1) & -THREAD_SIZE); - pr_info("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); p = stack; for (i = 0; i < kstack_depth_to_print; i++) { if (p + 1 > endstack) @@ -959,7 +960,12 @@ void show_stack(struct task_struct *task, unsigned long *stack) pr_cont(" %08lx", *p++); } pr_cont("\n"); - show_trace(stack); + show_trace(stack, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_INFO); } /* From b613263e4dd0212450c066cfd94a345cbefc9d1c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:27 +1000 Subject: [PATCH 044/241] microblaze: add loglvl to microblaze_unwind_inner() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to microblaze_unwind_inner() as a preparation for introducing show_stack_loglvl(). 
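Because the microblaze unwinder is recursive (hardware-exception and trap frames re-enter microblaze_unwind_inner()), the level has to be forwarded at every call site, including the self-calls; e.g. from the diff below:

    microblaze_unwind_inner(task, regs->r17 - 4,
                            fp + EX_HANDLER_STACK_SIZ,
                            regs->r15, trace, loglvl);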
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-19-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/kernel/unwind.c | 38 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 4241cdd28ee779..804bf0c99d8bff 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -154,7 +154,8 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, static void microblaze_unwind_inner(struct task_struct *task, unsigned long pc, unsigned long fp, unsigned long leaf_return, - struct stack_trace *trace); + struct stack_trace *trace, + const char *loglvl); /** * unwind_trap - Unwind through a system trap, that stored previous state @@ -162,16 +163,18 @@ static void microblaze_unwind_inner(struct task_struct *task, */ #ifdef CONFIG_MMU static inline void unwind_trap(struct task_struct *task, unsigned long pc, - unsigned long fp, struct stack_trace *trace) + unsigned long fp, struct stack_trace *trace, + const char *loglvl) { /* To be implemented */ } #else static inline void unwind_trap(struct task_struct *task, unsigned long pc, - unsigned long fp, struct stack_trace *trace) + unsigned long fp, struct stack_trace *trace, + const char *loglvl) { const struct pt_regs *regs = (const struct pt_regs *) fp; - microblaze_unwind_inner(task, regs->pc, regs->r1, regs->r15, trace); + microblaze_unwind_inner(task, regs->pc, regs->r1, regs->r15, trace, loglvl); } #endif @@ -184,11 +187,13 @@ static inline void unwind_trap(struct task_struct *task, unsigned long pc, * the caller's return address. * @trace : Where to store stack backtrace (PC values). * NULL == print backtrace to kernel log + * @loglvl : Used for printk log level if (trace == NULL). */ static void microblaze_unwind_inner(struct task_struct *task, unsigned long pc, unsigned long fp, unsigned long leaf_return, - struct stack_trace *trace) + struct stack_trace *trace, + const char *loglvl) { int ofs = 0; @@ -214,11 +219,11 @@ static void microblaze_unwind_inner(struct task_struct *task, const struct pt_regs *regs = (const struct pt_regs *) fp; #endif - pr_info("HW EXCEPTION\n"); + printk("%sHW EXCEPTION\n", loglvl); #ifndef CONFIG_MMU microblaze_unwind_inner(task, regs->r17 - 4, fp + EX_HANDLER_STACK_SIZ, - regs->r15, trace); + regs->r15, trace, loglvl); #endif return; } @@ -228,8 +233,8 @@ static void microblaze_unwind_inner(struct task_struct *task, if ((return_to >= handler->start_addr) && (return_to <= handler->end_addr)) { if (!trace) - pr_info("%s\n", handler->trap_name); - unwind_trap(task, pc, fp, trace); + printk("%s%s\n", loglvl, handler->trap_name); + unwind_trap(task, pc, fp, trace, loglvl); return; } } @@ -248,13 +253,13 @@ static void microblaze_unwind_inner(struct task_struct *task, } else { /* Have we reached userland? 
*/ if (unlikely(pc == task_pt_regs(task)->pc)) { - pr_info("[<%p>] PID %lu [%s]\n", - (void *) pc, + printk("%s[<%p>] PID %lu [%s]\n", + loglvl, (void *) pc, (unsigned long) task->pid, task->comm); break; } else - print_ip_sym(KERN_INFO, pc); + print_ip_sym(loglvl, pc); } /* Stop when we reach anything not part of the kernel */ @@ -285,11 +290,13 @@ static void microblaze_unwind_inner(struct task_struct *task, */ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) { + const char *loglvl = KERN_INFO; + if (task) { if (task == current) { const struct pt_regs *regs = task_pt_regs(task); microblaze_unwind_inner(task, regs->pc, regs->r1, - regs->r15, trace); + regs->r15, trace, loglvl); } else { struct thread_info *thread_info = (struct thread_info *)(task->stack); @@ -299,7 +306,8 @@ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) microblaze_unwind_inner(task, (unsigned long) &_switch_to, cpu_context->r1, - cpu_context->r15, trace); + cpu_context->r15, + trace, loglvl); } } else { unsigned long pc, fp; @@ -314,7 +322,7 @@ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) ); /* Since we are not a leaf function, use leaf_return = 0 */ - microblaze_unwind_inner(current, pc, fp, 0, trace); + microblaze_unwind_inner(current, pc, fp, 0, trace, loglvl); } } From c14a5d4a3d2aa04b923c34f52f972ac8d0bc6d25 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:28 +1000 Subject: [PATCH 045/241] microblaze: add loglvl to microblaze_unwind() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to microblaze_unwind() as a preparation to add show_stack_loglvl(). 
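Callers that pass a non-NULL trace are collecting PCs rather than printing, so the level never reaches printk(); they hand in an empty string, which would in any case just mean "no prefix, default level" if it were ever printed. Hence in the stacktrace savers below:

    /* trace consumers never print, so the level string is deliberately "" */
    microblaze_unwind(NULL, trace, "");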
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-20-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/include/asm/unwind.h | 3 ++- arch/microblaze/kernel/stacktrace.c | 4 ++-- arch/microblaze/kernel/traps.c | 2 +- arch/microblaze/kernel/unwind.c | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/microblaze/include/asm/unwind.h b/arch/microblaze/include/asm/unwind.h index c327d673622af6..3db81777a8872e 100644 --- a/arch/microblaze/include/asm/unwind.h +++ b/arch/microblaze/include/asm/unwind.h @@ -20,7 +20,8 @@ extern struct trap_handler_info microblaze_trap_handlers; extern const char _hw_exception_handler; extern const char ex_handler_unhandled; -void microblaze_unwind(struct task_struct *task, struct stack_trace *trace); +void microblaze_unwind(struct task_struct *task, struct stack_trace *trace, + const char *loglvl); #endif /* __MICROBLAZE_UNWIND_H */ diff --git a/arch/microblaze/kernel/stacktrace.c b/arch/microblaze/kernel/stacktrace.c index b4debe283a79e2..b266c4d6ed9df7 100644 --- a/arch/microblaze/kernel/stacktrace.c +++ b/arch/microblaze/kernel/stacktrace.c @@ -20,12 +20,12 @@ void save_stack_trace(struct stack_trace *trace) { /* Exclude our helper functions from the trace*/ trace->skip += 2; - microblaze_unwind(NULL, trace); + microblaze_unwind(NULL, trace, ""); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - microblaze_unwind(tsk, trace); + microblaze_unwind(tsk, trace, ""); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 45bbba9d919f91..be726ee120fb55 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -68,7 +68,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, words_to_show << 2, 0); pr_info("\n\nCall Trace:\n"); - microblaze_unwind(task, NULL); + microblaze_unwind(task, NULL, KERN_INFO); pr_info("\n"); if (!task) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 804bf0c99d8bff..778a761af0a7af 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -287,11 +287,11 @@ static void microblaze_unwind_inner(struct task_struct *task, * @task : Task whose stack we are to unwind (NULL == current) * @trace : Where to store stack backtrace (PC values). * NULL == print backtrace to kernel log + * @loglvl : Used for printk log level if (trace == NULL). */ -void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) +void microblaze_unwind(struct task_struct *task, struct stack_trace *trace, + const char *loglvl) { - const char *loglvl = KERN_INFO; - if (task) { if (task == current) { const struct pt_regs *regs = task_pt_regs(task); From 4960fcd5a47e4f106e6bbb810e970d449173a193 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:28 +1000 Subject: [PATCH 046/241] microblaze: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. 
In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-21-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/microblaze/kernel/traps.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index be726ee120fb55..149ae534937efb 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -31,7 +31,8 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { unsigned long words_to_show; u32 fp = (u32) sp; @@ -50,7 +51,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (kstack_depth_to_print && (words_to_show > kstack_depth_to_print)) words_to_show = kstack_depth_to_print; - pr_info("Kernel Stack:\n"); + printk("%sKernel Stack:\n", loglvl); /* * Make the first line an 'odd' size if necessary to get @@ -65,14 +66,19 @@ void show_stack(struct task_struct *task, unsigned long *sp) words_to_show -= line1_words; } } - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, + print_hex_dump(loglvl, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, words_to_show << 2, 0); - pr_info("\n\nCall Trace:\n"); - microblaze_unwind(task, NULL, KERN_INFO); - pr_info("\n"); + printk("%s\n\nCall Trace:\n", loglvl); + microblaze_unwind(task, NULL, loglvl); + printk("%s\n", loglvl); if (!task) task = current; debug_show_held_locks(task); } + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_INFO); +} From 5ff1d3f42e3b27682df0f02a6e447e5e73ea95cc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:29 +1000 Subject: [PATCH 047/241] mips: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
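These conversions rely on the KERN_* levels being ordinary string prefixes, so a caller-supplied level can be forwarded through a plain "%s" specifier. A minimal illustration (not part of the patch):

	const char *loglvl = KERN_INFO;

	/* The formatted message begins with the KERN_SOH level prefix,
	 * which printk() parses; this is equivalent to
	 * pr_info("Call Trace:\n"). */
	printk("%sCall Trace:\n", loglvl);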
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-22-dima@arista.com Signed-off-by: Dmitry Safonov Cc: James Hogan Cc: Paul Burton Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/mips/kernel/traps.c | 41 +++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 210fea63de7501..e49040739b61d5 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -108,26 +108,26 @@ void (*board_bind_eic_interrupt)(int irq, int regset); void (*board_ebase_setup)(void); void(*board_cache_error_setup)(void); -static void show_raw_backtrace(unsigned long reg29) +static void show_raw_backtrace(unsigned long reg29, const char *loglvl) { unsigned long *sp = (unsigned long *)(reg29 & ~3); unsigned long addr; - printk("Call Trace:"); + printk("%sCall Trace:", loglvl); #ifdef CONFIG_KALLSYMS - printk("\n"); + printk("%s\n", loglvl); #endif while (!kstack_end(sp)) { unsigned long __user *p = (unsigned long __user *)(unsigned long)sp++; if (__get_user(addr, p)) { - printk(" (Bad stack address)"); + printk("%s (Bad stack address)", loglvl); break; } if (__kernel_text_address(addr)) - print_ip_sym(KERN_DEFAULT, addr); + print_ip_sym(loglvl, addr); } - printk("\n"); + printk("%s\n", loglvl); } #ifdef CONFIG_KALLSYMS @@ -140,7 +140,8 @@ static int __init set_raw_show_trace(char *str) __setup("raw_show_trace", set_raw_show_trace); #endif -static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) +static void show_backtrace(struct task_struct *task, const struct pt_regs *regs, + const char *loglvl) { unsigned long sp = regs->regs[29]; unsigned long ra = regs->regs[31]; @@ -150,12 +151,12 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) task = current; if (raw_show_trace || user_mode(regs) || !__kernel_text_address(pc)) { - show_raw_backtrace(sp); + show_raw_backtrace(sp, loglvl); return; } - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { - print_ip_sym(KERN_DEFAULT, pc); + print_ip_sym(loglvl, pc); pc = unwind_stack(task, &sp, pc, &ra); } while (pc); pr_cont("\n"); @@ -166,19 +167,19 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) * with at least a bit of error checking ... */ static void show_stacktrace(struct task_struct *task, - const struct pt_regs *regs) + const struct pt_regs *regs, const char *loglvl) { const int field = 2 * sizeof(unsigned long); long stackdata; int i; unsigned long __user *sp = (unsigned long __user *)regs->regs[29]; - printk("Stack :"); + printk("%sStack :", loglvl); i = 0; while ((unsigned long) sp & (PAGE_SIZE - 1)) { if (i && ((i % (64 / field)) == 0)) { pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } if (i > 39) { pr_cont(" ..."); @@ -194,10 +195,11 @@ static void show_stacktrace(struct task_struct *task, i++; } pr_cont("\n"); - show_backtrace(task, regs); + show_backtrace(task, regs, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { struct pt_regs regs; mm_segment_t old_fs = get_fs(); @@ -221,10 +223,15 @@ void show_stack(struct task_struct *task, unsigned long *sp) * the stack in the kernel (not user) address space. 
*/ set_fs(KERNEL_DS); - show_stacktrace(task, &regs); + show_stacktrace(task, &regs, loglvl); set_fs(old_fs); } +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); +} + static void show_code(unsigned int __user *pc) { long i; @@ -373,7 +380,7 @@ void show_registers(struct pt_regs *regs) if (!user_mode(regs)) /* Necessary for getting the correct stack content */ set_fs(KERNEL_DS); - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); show_code((unsigned int __user *) regs->cp0_epc); printk("\n"); set_fs(old_fs); From 8d322a30642fa76d0e6c45d911b5fd81f4dd8df8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:30 +1000 Subject: [PATCH 048/241] nds32: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-23-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Greentime Hu Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/nds32/kernel/traps.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 40625760a125e1..90f12582c218a7 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -97,18 +97,19 @@ static void dump_instr(struct pt_regs *regs) } #define LOOP_TIMES (100) -static void __dump(struct task_struct *tsk, unsigned long *base_reg) +static void __dump(struct task_struct *tsk, unsigned long *base_reg, + const char *loglvl) { unsigned long ret_addr; int cnt = LOOP_TIMES, graph = 0; - pr_emerg("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); if (!IS_ENABLED(CONFIG_FRAME_POINTER)) { while (!kstack_end(base_reg)) { ret_addr = *base_reg++; if (__kernel_text_address(ret_addr)) { ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(KERN_EMERG, ret_addr); + print_ip_sym(loglvl, ret_addr); } if (--cnt < 0) break; @@ -124,17 +125,18 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(KERN_EMERG, ret_addr); + print_ip_sym(loglvl, ret_addr); } if (--cnt < 0) break; base_reg = (unsigned long *)next_fp; } } - pr_emerg("\n"); + printk("%s\n", loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { unsigned long *base_reg; @@ -151,10 +153,15 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) else __asm__ __volatile__("\tori\t%0, $fp, #0\n":"=r"(base_reg)); } - __dump(tsk, base_reg); + __dump(tsk, base_reg, loglvl);
barrier(); } +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + show_stack_loglvl(tsk, sp, KERN_EMERG); +} + DEFINE_SPINLOCK(die_lock); /* From 19b0cea605be0dae21a269112ce57a5630808914 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:31 +1000 Subject: [PATCH 049/241] nios2: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-24-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Ley Foon Tan Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/nios2/kernel/traps.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 486db793923c03..08071caa9b36fa 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -52,12 +52,14 @@ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) } /* - * The show_stack is an external API which we do not use ourselves. + * The show_stack(), show_stack_loglvl() are external API + * which we do not use ourselves. */ int kstack_depth_to_print = 48; -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *endstack, addr; int i; @@ -72,16 +74,16 @@ void show_stack(struct task_struct *task, unsigned long *stack) addr = (unsigned long) stack; endstack = (unsigned long *) PAGE_ALIGN(addr); - pr_emerg("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); for (i = 0; i < kstack_depth_to_print; i++) { if (stack + 1 > endstack) break; if (i % 8 == 0) - pr_emerg("\n "); - pr_emerg(" %08lx", *stack++); + printk("%s\n ", loglvl); + printk("%s %08lx", loglvl, *stack++); } - pr_emerg("\nCall Trace:"); + printk("%s\nCall Trace:", loglvl); i = 0; while (stack + 1 <= endstack) { addr = *stack++; @@ -97,11 +99,16 @@ void show_stack(struct task_struct *task, unsigned long *stack) (addr <= (unsigned long) _etext))) { if (i % 4 == 0) pr_emerg("\n "); - pr_emerg(" [<%08lx>]", addr); + printk("%s [<%08lx>]", loglvl, addr); i++; } } - pr_emerg("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_EMERG); } void __init trap_init(void) From 7fa8aa84ef1b9dff26b1181edeb0ece196dab8c7 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:32 +1000 Subject: [PATCH 050/241] openrisc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. 
It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-25-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/openrisc/kernel/traps.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index c11aa2e17ce05e..3b7978a22d6854 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -41,18 +41,26 @@ unsigned long __user *lwa_addr; void print_trace(void *data, unsigned long addr, int reliable) { - pr_emerg("[<%p>] %s%pS\n", (void *) addr, reliable ? "" : "? ", + const char *loglvl = data; + + printk("%s[<%p>] %s%pS\n", loglvl, (void *) addr, reliable ? "" : "? ", (void *) addr); } /* displays a short stack trace */ -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack_loglvl(struct task_struct *task, unsigned long *esp, + const char *loglvl) { if (esp == NULL) esp = (unsigned long *)&esp; - pr_emerg("Call trace:\n"); - unwind_stack(NULL, esp, print_trace); + printk("%sCall trace:\n", loglvl); + unwind_stack((void *)loglvl, esp, print_trace); +} + +void show_stack(struct task_struct *task, unsigned long *esp) +{ + show_stack_loglvl(task, esp, KERN_EMERG); } void show_registers(struct pt_regs *regs) From 0f6cec500c4d56afa9f4e36a869d8e44f4b79203 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:33 +1000 Subject: [PATCH 051/241] parisc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-26-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Helge Deller Cc: "James E.J. 
Bottomley" Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/parisc/kernel/traps.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 82fc0118948898..c2411de3730f2c 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -49,7 +49,7 @@ #include "../math-emu/math-emu.h" /* for handle_fpe() */ static void parisc_show_stack(struct task_struct *task, - struct pt_regs *regs); + struct pt_regs *regs, const char *loglvl); static int printbinary(char *buf, unsigned long x, int nbits) { @@ -155,7 +155,7 @@ void show_regs(struct pt_regs *regs) printk("%s IAOQ[1]: %pS\n", level, (void *) regs->iaoq[1]); printk("%s RP(r2): %pS\n", level, (void *) regs->gr[2]); - parisc_show_stack(current, regs); + parisc_show_stack(current, regs, KERN_DEFAULT); } } @@ -170,37 +170,43 @@ static DEFINE_RATELIMIT_STATE(_hppa_rs, } -static void do_show_stack(struct unwind_frame_info *info) +static void do_show_stack(struct unwind_frame_info *info, const char *loglvl) { int i = 1; - printk(KERN_CRIT "Backtrace:\n"); + printk("%sBacktrace:\n", loglvl); while (i <= MAX_UNWIND_ENTRIES) { if (unwind_once(info) < 0 || info->ip == 0) break; if (__kernel_text_address(info->ip)) { - printk(KERN_CRIT " [<" RFMT ">] %pS\n", - info->ip, (void *) info->ip); + printk("%s [<" RFMT ">] %pS\n", + loglvl, info->ip, (void *) info->ip); i++; } } - printk(KERN_CRIT "\n"); + printk("%s\n", loglvl); } static void parisc_show_stack(struct task_struct *task, - struct pt_regs *regs) + struct pt_regs *regs, const char *loglvl) { struct unwind_frame_info info; unwind_frame_init_task(&info, task, regs); - do_show_stack(&info); + do_show_stack(&info, loglvl); +} + +void show_stack_loglvl(struct task_struct *t, unsigned long *sp, + const char *loglvl) +{ + parisc_show_stack(t, NULL, loglvl); } void show_stack(struct task_struct *t, unsigned long *sp) { - parisc_show_stack(t, NULL); + show_stack_loglvl(t, sp, KERN_CRIT) } int is_valid_bugaddr(unsigned long iaoq) @@ -446,7 +452,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o /* show_stack(NULL, (unsigned long *)regs->gr[30]); */ struct unwind_frame_info info; unwind_frame_init(&info, current, regs); - do_show_stack(&info); + do_show_stack(&info, KERN_CRIT); } printk("\n"); From b99f56b91aebe1e3b99590397117d2d3ad0556eb Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:33 +1000 Subject: [PATCH 052/241] powerpc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-27-dima@arista.com Signed-off-by: Dmitry Safonov Acked-by: Michael Ellerman (powerpc) Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/kernel/process.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 048d64c4e11582..a456b4454b3fd0 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2063,7 +2063,8 @@ unsigned long get_wchan(struct task_struct *p) static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack(struct task_struct *tsk, unsigned long *stack) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; @@ -2088,7 +2089,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } lr = 0; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) break; @@ -2097,7 +2098,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; if (!firstframe || ip != lr) { - printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); + printk("%s["REG"] ["REG"] %pS", + loglvl, sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER ret_addr = ftrace_graph_ret_addr(current, &ftrace_idx, ip, stack); @@ -2119,8 +2121,9 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) struct pt_regs *regs = (struct pt_regs *) (sp + STACK_FRAME_OVERHEAD); lr = regs->link; - printk("--- interrupt: %lx at %pS\n LR = %pS\n", - regs->trap, (void *)regs->nip, (void *)lr); + printk("%s--- interrupt: %lx at %pS\n LR = %pS\n", + loglvl, regs->trap, + (void *)regs->nip, (void *)lr); firstframe = 1; } @@ -2130,6 +2133,11 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) put_task_stack(tsk); } +void show_stack(struct task_struct *tsk, unsigned long *stack) +{ + show_stack_loglvl(tsk, stack, KERN_DEFAULT); +} + #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void notrace __ppc64_runlatch_on(void) From 038af44a18ee399d660c898777533c58e79353ca Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:34 +1000 Subject: [PATCH 053/241] riscv: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
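Where the unwinder takes a callback, the level travels through the opaque callback argument; the riscv hunk below uses this pattern, distilled here for clarity:

	static bool print_trace_address(unsigned long pc, void *arg)
	{
		/* loglvl arrives through the void * cookie; the call site
		 * casts with (void *)loglvl, which also drops the const */
		const char *loglvl = arg;

		print_ip_sym(loglvl, pc);
		return false;	/* false == keep walking the frames */
	}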
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-28-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Albert Ou Cc: Palmer Dabbelt Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/riscv/kernel/stacktrace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 9f1ac258482fd7..aaa64bf007f820 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -99,16 +99,23 @@ void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(KERN_DEFAULT, pc); + const char *loglvl = arg; + + print_ip_sym(loglvl, pc); return false; } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, NULL); + walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); +} static bool save_wchan(unsigned long pc, void *arg) { From 361e592b7c876e50e6a2d56ea347103a61bb45f5 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:35 +1000 Subject: [PATCH 054/241] s390: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-29-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/s390/kernel/dumpstack.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 2c122d8bab9356..887a054919fc1e 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -126,18 +126,24 @@ int get_stack_info(unsigned long sp, struct task_struct *task, return -EINVAL; } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct unwind_state state; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) - printk(state.reliable ? " [<%016lx>] %pSR \n" : - "([<%016lx>] %pSR)\n", - state.ip, (void *) state.ip); + printk(state.reliable ? 
"%s [<%016lx>] %pSR \n" : + "%s([<%016lx>] %pSR)\n", + loglvl, state.ip, (void *) state.ip); debug_show_held_locks(task ? : current); } +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_DEFAULT); +} + static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); From 613192a5fc3995ee2c4682a64699bf121d41e625 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:36 +1000 Subject: [PATCH 055/241] sh: add loglvl to dump_mem() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to dump_mem() as a preparation to introduce show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-30-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/include/asm/kdebug.h | 3 ++- arch/sh/kernel/dumpstack.c | 17 +++++++++-------- arch/sh/kernel/traps.c | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/sh/include/asm/kdebug.h b/arch/sh/include/asm/kdebug.h index 5212f5fcd75201..de8693fabb1d95 100644 --- a/arch/sh/include/asm/kdebug.h +++ b/arch/sh/include/asm/kdebug.h @@ -13,6 +13,7 @@ enum die_val { /* arch/sh/kernel/dumpstack.c */ extern void printk_address(unsigned long address, int reliable); -extern void dump_mem(const char *str, unsigned long bottom, unsigned long top); +extern void dump_mem(const char *str, const char *loglvl, + unsigned long bottom, unsigned long top); #endif /* __ASM_SH_KDEBUG_H */ diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 9f1c9c11d62df4..6784b914fba010 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -16,30 +16,31 @@ #include #include -void dump_mem(const char *str, unsigned long bottom, unsigned long top) +void dump_mem(const char *str, const char *loglvl, + unsigned long bottom, unsigned long top) { unsigned long p; int i; - printk("%s(0x%08lx to 0x%08lx)\n", str, bottom, top); + printk("%s%s(0x%08lx to 0x%08lx)\n", loglvl, str, bottom, top); for (p = bottom & ~31; p < top; ) { - printk("%04lx: ", p & 0xffff); + printk("%s%04lx: ", loglvl, p & 0xffff); for (i = 0; i < 8; i++, p += 4) { unsigned int val; if (p < bottom || p >= top) - printk(" "); + printk("%s ", loglvl); else { if (__get_user(val, (unsigned int __user *)p)) { - printk("\n"); + printk("%s\n", loglvl); return; } - printk("%08x ", val); + printk("%s%08x ", loglvl, val); } } - printk("\n"); + printk("%s\n", loglvl); } } @@ -156,7 +157,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) sp = (unsigned long *)tsk->thread.sp; stack = (unsigned long)sp; - dump_mem("Stack: ", stack, THREAD_SIZE + + dump_mem("Stack: ", 
KERN_DEFAULT, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL); } diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 2130381c9d5749..a33025451fcd09 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -38,8 +38,8 @@ void die(const char *str, struct pt_regs *regs, long err) task_pid_nr(current), task_stack_page(current) + 1); if (!user_mode(regs) || in_interrupt()) - dump_mem("Stack: ", regs->regs[15], THREAD_SIZE + - (unsigned long)task_stack_page(current)); + dump_mem("Stack: ", KERN_DEFAULT, regs->regs[15], + THREAD_SIZE + (unsigned long)task_stack_page(current)); notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV); From cb90e47762c668d44ecf29dac81fe4c3fcfef920 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:36 +1000 Subject: [PATCH 056/241] sh: remove needless printk() Currently `data' is always an empty line "". No need for additional printk() call. Link: http://lkml.kernel.org/r/20200418201944.482088-31-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/kernel/dumpstack.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 6784b914fba010..2c1a78e5776b81 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -118,7 +118,6 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk("%s", (char *)data); printk_address(addr, reliable); } From 75e26a4e4b6fd212d488f56330169cbb534440f2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:37 +1000 Subject: [PATCH 057/241] sh: add loglvl to printk_address() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to printk_address() as a preparation to introduce show_stack_loglvl(). As a good side-effect show_fault_oops() now prints the address with KERN_ALERT as the rest of output, making sure there won't be a situation where "PC: " is printed without actual address.
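"PC:" and the address are emitted as two separate printk() records, so they have to carry the same level to be suppressed or shown together. Schematically, from the hunk that follows:

	/* before: the address record went out at the default log level */
	printk(KERN_ALERT "PC:");
	printk_address(regs->pc, 1);

	/* after: both records share KERN_ALERT */
	printk(KERN_ALERT "PC:");
	printk_address(regs->pc, 1, KERN_ALERT);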
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-32-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/include/asm/kdebug.h | 3 ++- arch/sh/kernel/dumpstack.c | 6 +++--- arch/sh/mm/fault.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/sh/include/asm/kdebug.h b/arch/sh/include/asm/kdebug.h index de8693fabb1d95..960545306afa4d 100644 --- a/arch/sh/include/asm/kdebug.h +++ b/arch/sh/include/asm/kdebug.h @@ -12,7 +12,8 @@ enum die_val { }; /* arch/sh/kernel/dumpstack.c */ -extern void printk_address(unsigned long address, int reliable); +extern void printk_address(unsigned long address, int reliable, + const char *loglvl); extern void dump_mem(const char *str, const char *loglvl, unsigned long bottom, unsigned long top); diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 2c1a78e5776b81..959064b9005550 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -44,9 +44,9 @@ void dump_mem(const char *str, const char *loglvl, } } -void printk_address(unsigned long address, int reliable) +void printk_address(unsigned long address, int reliable, const char *loglvl) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk("%s [<%p>] %s%pS\n", loglvl, (void *) address, reliable ? "" : "? ", (void *) address); } @@ -118,7 +118,7 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk_address(addr, reliable); + printk_address(addr, reliable, (char *)data); } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 5f23d790759766..f5da8f5ea3899c 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -196,7 +196,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long address) printk(KERN_CONT " at %08lx\n", address); printk(KERN_ALERT "PC:"); - printk_address(regs->pc, 1); + printk_address(regs->pc, 1, KERN_ALERT); show_pte(NULL, address); } From b1ec620fa2d559aab0f0a75bd7d9791ae1972fcc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:38 +1000 Subject: [PATCH 058/241] sh: add loglvl to show_trace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to show_trace() as a preparation to introduce show_stack_loglvl(). 
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-33-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/include/asm/processor_32.h | 2 +- arch/sh/kernel/dumpstack.c | 10 +++++----- arch/sh/kernel/process_32.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 0e0ecc0132e3b1..d44409413418a5 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -171,7 +171,7 @@ static __inline__ void enable_fpu(void) #define thread_saved_pc(tsk) (tsk->thread.pc) void show_trace(struct task_struct *tsk, unsigned long *sp, - struct pt_regs *regs); + struct pt_regs *regs, const char *loglvl); #ifdef CONFIG_DUMP_CODE void show_code(struct pt_regs *regs); diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 959064b9005550..d488a47a1f0fcf 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -127,16 +127,16 @@ static const struct stacktrace_ops print_trace_ops = { }; void show_trace(struct task_struct *tsk, unsigned long *sp, - struct pt_regs *regs) + struct pt_regs *regs, const char *loglvl) { if (regs && user_mode(regs)) return; - printk("\nCall trace:\n"); + printk("%s\nCall trace:\n", loglvl); - unwind_stack(tsk, regs, sp, &print_trace_ops, ""); + unwind_stack(tsk, regs, sp, &print_trace_ops, (void *)loglvl); - printk("\n"); + printk("%s\n", loglvl); if (!tsk) tsk = current; @@ -158,5 +158,5 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) stack = (unsigned long)sp; dump_mem("Stack: ", KERN_DEFAULT, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - show_trace(tsk, sp, NULL); + show_trace(tsk, sp, NULL, KERN_DEFAULT); } diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a094633874c373..456cc8d171f725 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -59,7 +59,7 @@ void show_regs(struct pt_regs * regs) printk("MACH: %08lx MACL: %08lx GBR : %08lx PR : %08lx\n", regs->mach, regs->macl, regs->gbr, regs->pr); - show_trace(NULL, (unsigned long *)regs->regs[15], regs); + show_trace(NULL, (unsigned long *)regs->regs[15], regs, KERN_DEFAULT); show_code(regs); } From d00eb6cbb8644efac3df5873c6334afa48aefb23 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:39 +1000 Subject: [PATCH 059/241] sh: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-34-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/kernel/dumpstack.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index d488a47a1f0fcf..cc51e9d7466748 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -144,7 +144,8 @@ void show_trace(struct task_struct *tsk, unsigned long *sp, debug_show_held_locks(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { unsigned long stack; @@ -156,7 +157,12 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) sp = (unsigned long *)tsk->thread.sp; stack = (unsigned long)sp; - dump_mem("Stack: ", KERN_DEFAULT, stack, THREAD_SIZE + + dump_mem("Stack: ", loglvl, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - show_trace(tsk, sp, NULL, KERN_DEFAULT); + show_trace(tsk, sp, NULL, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } From 1dcfe2a52d73ced18ad45a2fa9dc0b7e522481d0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:40 +1000 Subject: [PATCH 060/241] sparc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-35-dima@arista.com Signed-off-by: Dmitry Safonov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sparc/kernel/process_32.c | 17 ++++++++++++----- arch/sparc/kernel/traps_64.c | 15 +++++++++++---- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 26cca65e92465d..0b07de5618e50b 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -145,10 +145,12 @@ void show_regs(struct pt_regs *r) } /* - * The show_stack is an external API which we do not use ourselves. + * The show_stack(), show_stack_loglvl() are external APIs which + * we do not use ourselves. * The oops is printed in die_if_kernel. 
*/ -void show_stack(struct task_struct *tsk, unsigned long *_ksp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, + const char *loglvl) { unsigned long pc, fp; unsigned long task_base; @@ -170,11 +172,16 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) break; rw = (struct reg_window32 *) fp; pc = rw->ins[7]; - printk("[%08lx : ", pc); - printk("%pS ] ", (void *) pc); + printk("%s[%08lx : ", loglvl, pc); + printk("%s%pS ] ", loglvl, (void *) pc); fp = rw->ins[6]; } while (++count < 16); - printk("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } /* diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 27778b65a965e2..8715bc93bd9d83 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -2452,7 +2453,8 @@ static void user_instruction_dump(unsigned int __user *pc) printk("\n"); } -void show_stack(struct task_struct *tsk, unsigned long *_ksp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, + const char *loglvl) { unsigned long fp, ksp; struct thread_info *tp; @@ -2476,7 +2478,7 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) fp = ksp + STACK_BIAS; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { struct sparc_stackf *sf; struct pt_regs *regs; @@ -2497,14 +2499,14 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) fp = (unsigned long)sf->fp + STACK_BIAS; } - printk(" [%016lx] %pS\n", pc, (void *) pc); + print_ip_sym(loglvl, pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if ((pc + 8UL) == (unsigned long) &return_to_handler) { struct ftrace_ret_stack *ret_stack; ret_stack = ftrace_graph_get_ret_stack(tsk, graph); if (ret_stack) { pc = ret_stack->ret; - printk(" [%016lx] %pS\n", pc, (void *) pc); + print_ip_sym(loglvl, pc); graph++; } } @@ -2512,6 +2514,11 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) } while (++count < 16); } +void show_stack(struct task_struct *tsk, unsigned long *_ksp) +{ + show_stack_loglvl(tsk, _ksp, KERN_DEFAULT); +} + static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; From 7a996776bb81ed46757bb0368ccfb13bf3cd8e4d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:41 +1000 Subject: [PATCH 061/241] um/sysrq: remove needless variable sp `sp' is a needless exercise here.
Link: http://lkml.kernel.org/r/20200418201944.482088-36-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/um/kernel/sysrq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index c71b5ef7ea8c3a..c831a1c2eb94a7 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -27,7 +27,6 @@ static const struct stacktrace_ops stackops = { void show_stack(struct task_struct *task, unsigned long *stack) { - unsigned long *sp = stack; struct pt_regs *segv_regs = current->thread.segv_regs; int i; @@ -38,10 +37,9 @@ void show_stack(struct task_struct *task, unsigned long *stack) } if (!stack) - sp = get_stack_pointer(task, segv_regs); + stack = get_stack_pointer(task, segv_regs); pr_info("Stack:\n"); - stack = sp; for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; From 5aa63e9055829c8b5da08f2a0c92a7ca762656ad Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:41 +1000 Subject: [PATCH 062/241] um: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-37-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/um/kernel/sysrq.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index c831a1c2eb94a7..1b54b6431499b5 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -17,7 +17,9 @@ static void _print_addr(void *data, unsigned long address, int reliable) { - pr_info(" [<%08lx>] %s%pS\n", address, reliable ? "" : "? ", + const char *loglvl = data; + + printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? 
", (void *)address); } @@ -25,7 +27,8 @@ static const struct stacktrace_ops stackops = { .address = _print_addr }; -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct pt_regs *segv_regs = current->thread.segv_regs; int i; @@ -39,17 +42,22 @@ void show_stack(struct task_struct *task, unsigned long *stack) if (!stack) stack = get_stack_pointer(task, segv_regs); - pr_info("Stack:\n"); + printk("%sStack:\n", loglvl); for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); + printk("%s\n", loglvl); pr_cont(" %08lx", *stack++); } - pr_cont("\n"); + printk("%s\n", loglvl); + + printk("%sCall Trace:\n", loglvl); + dump_trace(current, &stackops, (void *)loglvl); + printk("%s\n", loglvl); +} - pr_info("Call Trace:\n"); - dump_trace(current, &stackops, NULL); - pr_info("\n"); +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_INFO); } From 3ff0e861204a777aa07bc865057d72f68c866de8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:42 +1000 Subject: [PATCH 063/241] unicore32: remove unused pmode argument in c_backtrace() The pmode parameter isn't used in assembly - remove it. Second argument will be reused for printk() log level. Link: http://lkml.kernel.org/r/20200418201944.482088-38-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/unicore32/kernel/setup.h | 2 +- arch/unicore32/kernel/traps.c | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index e40d3603c7e78f..03e70e37f47260 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -29,7 +29,7 @@ extern void kernel_thread_helper(void); extern void __init early_signal_init(void); extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp); extern void __show_regs(struct pt_regs *); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index e24f67283864cd..3682a4c5d9274e 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -137,7 +137,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { - unsigned int fp, mode; + unsigned int fp; int ok = 1; printk(KERN_DEFAULT "Backtrace: "); @@ -145,16 +145,12 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; - if (regs) { + if (regs) fp = regs->UCreg_fp; - mode = processor_mode(regs); - } else if (tsk != current) { + else if (tsk != current) fp = thread_saved_fp(tsk); - mode = 0x10; - } else { + else asm("mov %0, fp" : "=r" (fp) : : "cc"); - mode = 0x10; - } if (!fp) { printk("no frame pointer"); @@ -167,7 +163,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) printk("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp); } void show_stack(struct task_struct *tsk, unsigned long *sp) From f04ce31230af9be56cf4dd8d7db7e467799047dd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:43 +1000 Subject: [PATCH 064/241] unicore32: add loglvl to c_backtrace() Currently, the log-level of show_stack() depends on a 
platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to c_backtrace() as a preparation for introducing show_stack_loglvl() [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-39-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/unicore32/kernel/setup.h | 2 +- arch/unicore32/kernel/traps.c | 2 +- arch/unicore32/lib/backtrace.S | 24 ++++++++++++++++-------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index 03e70e37f47260..96735232318533 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -29,7 +29,7 @@ extern void kernel_thread_helper(void); extern void __init early_signal_init(void); extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp); +extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl); extern void __show_regs(struct pt_regs *); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 3682a4c5d9274e..2b7d734568659d 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -163,7 +163,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) printk("\n"); if (ok) - c_backtrace(fp); + c_backtrace(fp, KERN_DEFAULT); } void show_stack(struct task_struct *tsk, unsigned long *sp) diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S index f303671e2a4e9a..6221944b81f37c 100644 --- a/arch/unicore32/lib/backtrace.S +++ b/arch/unicore32/lib/backtrace.S @@ -16,6 +16,7 @@ #define sv_fp v5 #define sv_pc v6 #define offset v8 +#define loglvl v9 ENTRY(__backtrace) mov r0, fp @@ -27,10 +28,11 @@ ENTRY(c_backtrace) ENDPROC(__backtrace) ENDPROC(c_backtrace) #else - stm.w (v4 - v8, lr), [sp-] @ Save an extra register + stm.w (v4 - v10, lr), [sp-] @ Save an extra register @ so we have a location... 
mov.a frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r1 1: stm.w (pc), [sp-] @ calculate offset of PC stored ldw.w r0, [sp]+, #4 @ by stmfd for this CPU @@ -95,9 +97,10 @@ for_each_frame: bua for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame b.l printk -no_frame: ldm.w (v4 - v8, pc), [sp]+ +no_frame: ldm.w (v4 - v10, pc), [sp]+ ENDPROC(__backtrace) ENDPROC(c_backtrace) @@ -128,8 +131,11 @@ ENDPROC(c_backtrace) add v7, v7, #1 cxor.a v7, #6 cmoveq v7, #1 - cmoveq r1, #'\n' - cmovne r1, #' ' + bne 201f + adr r0, .Lcr + mov r1, loglvl + b.l printk +201: ldw.w r3, [stack]+, #-4 mov r2, reg csub.a r2, #8 @@ -141,18 +147,20 @@ ENDPROC(c_backtrace) add r2, r2, #0x10 @ so r2 need add 16 201: adr r0, .Lfp + mov r1, loglvl b.l printk 2: sub.a reg, reg, #1 bns 1b cxor.a v7, #0 beq 201f adr r0, .Lcr + mov r1, loglvl b.l printk 201: ldm.w (instr, reg, stack, v7, pc), [sp]+ -.Lfp: .asciz "%cr%d:%08x" -.Lcr: .asciz "\n" -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lfp: .asciz "%sr%d:%08x " +.Lcr: .asciz "%s\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0x92eec000 >> 14 @ stm.w sp, (... fp, ip, lr, pc) .word 0x92e10000 >> 14 @ stm.w sp, () From eaa7d2140a85c96f2239570060ea7308522c4bea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:43 +1000 Subject: [PATCH 065/241] unicore32: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a nice side-effect - print backtrace in __die() with the same log level as the rest of function. 
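To make the shape of these conversions concrete, the pattern repeated across
architectures looks roughly like the sketch below (an editorial illustration
distilled from the series, not a verbatim quote of any single patch;
dump_backtrace() stands in for whatever arch-specific walker is used):

	void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp,
			       const char *loglvl)
	{
		dump_backtrace(NULL, tsk, loglvl);	/* walker prints with loglvl */
		barrier();
	}

	void show_stack(struct task_struct *tsk, unsigned long *sp)
	{
		/* old entry point keeps its old default level for now */
		show_stack_loglvl(tsk, sp, KERN_DEFAULT);
	}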
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-40-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/unicore32/kernel/traps.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 2b7d734568659d..8b1335997f5025 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -135,12 +135,13 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) set_fs(fs); } -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp; int ok = 1; - printk(KERN_DEFAULT "Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -153,25 +154,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) asm("mov %0, fp" : "=r" (fp) : : "cc"); if (!fp) { - printk("no frame pointer"); + printk("%sno frame pointer", loglvl); ok = 0; } else if (verify_stack(fp)) { - printk("invalid frame pointer 0x%08x", fp); + printk("%sinvalid frame pointer 0x%08x", loglvl, fp); ok = 0; } else if (fp < (unsigned long)end_of_stack(tsk)) - printk("frame pointer underflow"); - printk("\n"); + printk("%sframe pointer underflow", loglvl); + printk("%s\n", loglvl); if (ok) - c_backtrace(fp, KERN_DEFAULT); + c_backtrace(fp, loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, loglvl); barrier(); } +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + show_stack_loglvl(tsk, sp, KERN_DEFAULT) +} + static int __die(const char *str, int err, struct thread_info *thread, struct pt_regs *regs) { @@ -196,7 +203,7 @@ static int __die(const char *str, int err, struct thread_info *thread, if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } From c8632dbcd0f75a27809d79244ea1cac10e0185a3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:44 +1000 Subject: [PATCH 066/241] x86: add missing const qualifiers for log_lvl Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Keep log_lvl const show_trace_log_lvl() and printk_stack_address() as the new generic show_stack_loglvl() wants to have a proper const qualifier. 
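The const qualifier is not cosmetic: every KERN_* macro expands to a string
literal, so the values callers pass are const-qualified data, and a callee
taking a plain char * would force each caller to cast the qualifier away. A
minimal sketch of the calling convention (the names here are assumed, for
illustration only):

	static void emit(const char *log_lvl)
	{
		printk("%shello\n", log_lvl);	/* level travels via %s */
	}

	static void example(void)
	{
		emit(KERN_INFO);	/* fine: string literals are const */
	}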
And gcc rightfully produces warnings when it is not kept:

arch/x86/kernel/dumpstack.c: In function `show_stack':
arch/x86/kernel/dumpstack.c:294:37: warning: passing argument 4 of `show_trace_log_lvl' discards `const' qualifier from pointer target type [-Wdiscarded-qualifiers]
  294 |         show_trace_log_lvl(task, NULL, sp, loglvl);
      |                                            ^~~~~~
arch/x86/kernel/dumpstack.c:163:32: note: expected `char *' but argument is of type `const char *'
  163 |                 unsigned long *stack, char *log_lvl)
      |                                       ~~~~~~^~~~~~~

[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u
Link: http://lkml.kernel.org/r/20200418201944.482088-41-dima@arista.com
Signed-off-by: Dmitry Safonov
Cc: Borislav Petkov
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 arch/x86/include/asm/stacktrace.h | 2 +-
 arch/x86/kernel/dumpstack.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 14db05086bbf27..5ae5a68e469d3d 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -87,7 +87,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
 }
 
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			unsigned long *stack, char *log_lvl);
+			unsigned long *stack, const char *log_lvl);
 
 /* The form of the top of the frame on the stack */
 struct stack_frame {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae64ec7f752f40..b94bc31a1757bb 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info)
 }
 
 static void printk_stack_address(unsigned long address, int reliable,
-				 char *log_lvl)
+				 const char *log_lvl)
 {
 	touch_nmi_watchdog();
 	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
@@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
 }
 
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			unsigned long *stack, char *log_lvl)
+			unsigned long *stack, const char *log_lvl)
 {
 	struct unwind_state state;
 	struct stack_info stack_info = {0};

From 3b0c7d992cb5041cdfdc1ff677a483894ccad072 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov
Date: Thu, 4 Jun 2020 11:45:45 +1000
Subject: [PATCH 067/241] x86: add show_stack_loglvl()

Currently, the log-level of show_stack() depends on a platform
realization. It creates situations where the headers are printed with
lower log level or higher than the stacktrace (depending on a platform
or user). Furthermore, it forces the logic decision from user to an
architecture side. In result, some users as sysrq/kdb/etc are doing
tricks with temporary rising console_loglevel while printing their
messages. And in result it not only may print unwanted messages from
other CPUs, but also omit printing at all in the unlucky case where the
printk() was deferred.

Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an
easier approach than introducing more printk buffers. Also, it will
consolidate printings with headers.

Introduce show_stack_loglvl(), that eventually will substitute
show_stack().

[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u
Link: http://lkml.kernel.org/r/20200418201944.482088-42-dima@arista.com
Signed-off-by: Dmitry Safonov
Cc: Borislav Petkov
Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/dumpstack.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b94bc31a1757bb..4396f2cfad1972 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, } } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { task = task ? : current; @@ -290,7 +291,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) From 09be89346b7de96da3ffa3bebc4d714befdb12c5 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:45 +1000 Subject: [PATCH 068/241] xtensa: add loglvl to show_trace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to show_trace() as a preparation for introducing show_stack_loglvl(). 
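Two mechanics make this conversion work and are worth spelling out. First,
printk() recognizes the level prefix at the start of the formatted message,
so a KERN_* string expanded through a leading "%s" behaves like a literal
prefix. Second, where the level must cross an opaque callback boundary, it is
passed through the walker's void * cookie; restating the xtensa hunk below in
isolation:

	static int show_trace_cb(struct stackframe *frame, void *data)
	{
		const char *loglvl = data;	/* cookie carries the level */

		printk("%s [<%08lx>] %pB\n",
		       loglvl, frame->pc, (void *)frame->pc);
		return 0;
	}

	/* caller: */
	walk_stackframe(sp, show_trace_cb, (void *)loglvl);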
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-43-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Chris Zankel Cc: Max Filippov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/xtensa/kernel/traps.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 0976e27b8d5dad..c397a02457bca1 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -479,18 +479,22 @@ void show_regs(struct pt_regs * regs) static int show_trace_cb(struct stackframe *frame, void *data) { + const char *loglvl = data; + if (kernel_text_address(frame->pc)) - pr_cont(" [<%08lx>] %pB\n", frame->pc, (void *)frame->pc); + printk("%s [<%08lx>] %pB\n", + loglvl, frame->pc, (void *)frame->pc); return 0; } -void show_trace(struct task_struct *task, unsigned long *sp) +static void show_trace(struct task_struct *task, unsigned long *sp, + const char *loglvl) { if (!sp) sp = stack_pointer(task); - pr_info("Call Trace:\n"); - walk_stackframe(sp, show_trace_cb, NULL); + printk("%sCall Trace:\n", loglvl); + walk_stackframe(sp, show_trace_cb, (void *)loglvl); } #define STACK_DUMP_ENTRY_SIZE 4 @@ -511,7 +515,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, sp, len, false); - show_trace(task, sp); + show_trace(task, stack, KERN_INFO); } DEFINE_SPINLOCK(die_lock); From f84bdee00d7147c525a17835647d076fd46c6f36 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:45:46 +1000 Subject: [PATCH 069/241] xtensa-add-loglvl-to-show_trace-fix build fix Link: http://lkml.kernel.org/r/20200511194534.GA1018386@kernel.org Cc: Dmitry Safonov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/xtensa/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index c397a02457bca1..5013b62a9943f5 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -515,7 +515,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, sp, len, false); - show_trace(task, stack, KERN_INFO); + show_trace(task, sp, KERN_INFO); } DEFINE_SPINLOCK(die_lock); From 0900f8747e1ea86c32730b98204831a4ed6724b7 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:47 +1000 Subject: [PATCH 070/241] xtensa: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). 
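One detail the xtensa conversion below relies on: print_hex_dump() already
takes the log level as its first argument (a const char *), so the incoming
loglvl can be forwarded verbatim instead of hardcoding KERN_INFO, as in this
call restated from the diff that follows:

	print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE,
		       STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE,
		       sp, len, false);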
[1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Link: http://lkml.kernel.org/r/20200418201944.482088-44-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Chris Zankel Cc: Max Filippov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/xtensa/kernel/traps.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 5013b62a9943f5..1013acc2e03ea1 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -501,7 +501,8 @@ static void show_trace(struct task_struct *task, unsigned long *sp, #define STACK_DUMP_LINE_SIZE 32 static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { size_t len; @@ -511,11 +512,16 @@ void show_stack(struct task_struct *task, unsigned long *sp) len = min((-(size_t)sp) & (THREAD_SIZE - STACK_DUMP_ENTRY_SIZE), kstack_depth_to_print * STACK_DUMP_ENTRY_SIZE); - pr_info("Stack:\n"); - print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, + printk("%sStack:\n", loglvl); + print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, sp, len, false); - show_trace(task, sp, KERN_INFO); + show_trace(task, sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_INFO); } DEFINE_SPINLOCK(die_lock); From 24e49ae3f03ff6a418ff922d2bb767dd4b1b1808 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:47 +1000 Subject: [PATCH 071/241] sysrq: use show_stack_loglvl() Show the stack trace on a CPU with the same log level as "CPU%d" header. Link: http://lkml.kernel.org/r/20200418201944.482088-45-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Greg Kroah-Hartman Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 477cdc1e9cf316..7bd935379dece4 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 95fb9e025247ef..373e4e3faf2ad9 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -31,6 +31,8 @@ extern void show_regs(struct pt_regs *); * trace (or NULL if the entire call-chain of the task should be shown). */ extern void show_stack(struct task_struct *task, unsigned long *sp); +extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); From 2396b235162097f87f1c32b3ac6564db439232f9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:48 +1000 Subject: [PATCH 072/241] x86/amd_gart: print stacktrace for a leak with KERN_ERR It's under CONFIG_IOMMU_LEAK option which is enabled by debug config. Likely the backtrace is worth to be seen - so aligning with log level of error message in iommu_full(). Link: http://lkml.kernel.org/r/20200418201944.482088-46-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/x86/kernel/amd_gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 16133819415c7d..9d2c076be37a66 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif From eba4319061cefdee45d1c7bfb0b49b8e49df1992 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:49 +1000 Subject: [PATCH 073/241] power: use show_stack_loglvl() Aligning with other watchdog messages just before panic - use KERN_EMERG. Link: http://lkml.kernel.org/r/20200418201944.482088-47-dima@arista.com Signed-off-by: Dmitry Safonov Acked-by: Rafael J. Wysocki Cc: Greg Kroah-Hartman Cc: Len Brown Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/base/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bb98b813554fdd..be5af89bbff16a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -519,7 +519,7 @@ static void dpm_watchdog_handler(struct timer_list *t) struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); - show_stack(wd->tsk, NULL); + show_stack_loglvl(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } From dd5d2b5adb88075656a0811f5f034869b5552846 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:50 +1000 Subject: [PATCH 074/241] kdb: don't play with console_loglevel Print the stack trace with KERN_EMERG - it should be always visible. Playing with console_loglevel is a bad idea as there may be more messages printed than wanted. Also the stack trace might be not printed at all if printk() was deferred and console_loglevel was raised back before the trace got flushed. Unfortunately, after rebasing on commit 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master"), kdb_show_stack() uses now kdb_dump_stack_on_cpu(), which for now won't be converted as it uses dump_stack() instead of show_stack(). Convert for now the branch that uses show_stack() and remove console_loglevel exercise from that case. 
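Restated as a minimal before/after sketch (the real change is in the
kdb_bt.c diff below):

	/* before: globally unmask the console, print, restore - racy */
	int old_lvl = console_loglevel;

	console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
	show_stack(p, addr);
	console_loglevel = old_lvl;

	/* after: tag the trace itself; no global state is touched */
	show_stack_loglvl(p, addr, KERN_EMERG);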
Link: http://lkml.kernel.org/r/20200418201944.482088-48-dima@arista.com
Signed-off-by: Dmitry Safonov
Reviewed-by: Douglas Anderson
Acked-by: Daniel Thompson
Cc: Jason Wessel
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/debug/kdb/kdb_bt.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 3de0cc780c164a..43f5dcd2b9ac7e 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,17 +21,18 @@
 static void kdb_show_stack(struct task_struct *p, void *addr)
 {
-	int old_lvl = console_loglevel;
-
-	console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
 	kdb_trap_printk++;
-	if (!addr && kdb_task_has_cpu(p))
+	if (!addr && kdb_task_has_cpu(p)) {
+		int old_lvl = console_loglevel;
+
+		console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
 		kdb_dump_stack_on_cpu(kdb_process_cpu(p));
-	else
-		show_stack(p, addr);
+		console_loglevel = old_lvl;
+	} else {
+		show_stack_loglvl(p, addr, KERN_EMERG);
+	}
 
-	console_loglevel = old_lvl;
 	kdb_trap_printk--;
 }

From d4998949280137f5f0b078456782302b777431f9 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov
Date: Thu, 4 Jun 2020 11:45:51 +1000
Subject: [PATCH 075/241] sched: print stack trace with KERN_INFO

Aligning with the other messages printed in sched_show_task(), use
KERN_INFO to print the backtrace.

Link: http://lkml.kernel.org/r/20200418201944.482088-49-dima@arista.com
Signed-off-by: Dmitry Safonov
Cc: Ben Segall
Cc: Dietmar Eggemann
Cc: Ingo Molnar
Cc: Juri Lelli
Cc: Mel Gorman
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Vincent Guittot
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06da3c3e317d4..c68a6e7b306feb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p)
 		(unsigned long)task_thread_info(p)->flags);
 
 	print_worker_info(KERN_INFO, p);
-	show_stack(p, NULL);
+	show_stack_loglvl(p, NULL, KERN_INFO);
 	put_task_stack(p);
 }
 EXPORT_SYMBOL_GPL(sched_show_task);

From 8ec2014e52f18ca5dc08c83ff353a93815d59b3d Mon Sep 17 00:00:00 2001
From: Dmitry Safonov
Date: Thu, 4 Jun 2020 11:45:51 +1000
Subject: [PATCH 076/241] kernel: use show_stack_loglvl()

Align the last users of show_stack() on KERN_DEFAULT, matching the
surrounding headers/messages.
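For reference, the levels being passed around are plain prefix strings. At
the time of this series, include/linux/kern_levels.h defines them
approximately as follows (abridged here), which is why a const char *
parameter plus a leading "%s" is all the plumbing these patches need:

	#define KERN_SOH	"\001"		/* ASCII start-of-header */
	#define KERN_EMERG	KERN_SOH "0"	/* system is unusable */
	#define KERN_ERR	KERN_SOH "3"	/* error conditions */
	#define KERN_INFO	KERN_SOH "6"	/* informational */
	#define KERN_DEBUG	KERN_SOH "7"	/* debug-level messages */
	#define KERN_DEFAULT	KERN_SOH "d"	/* default loglevel */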
Link: http://lkml.kernel.org/r/20200418201944.482088-50-dima@arista.com Signed-off-by: Dmitry Safonov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- kernel/locking/rtmutex-debug.c | 2 +- lib/dump_stack.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458db..5e63d6e8a2230b 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack_loglvl(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 33ffbf30885394..5595e8962cf6bd 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -74,7 +74,7 @@ void show_regs_print_info(const char *log_lvl) static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_DEFAULT); } /** From 8fabe3171629f8a827788530ac424995f103b93d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 4 Jun 2020 11:45:52 +1000 Subject: [PATCH 077/241] kernel: rename show_stack_loglvl() => show_stack() Now the last users of show_stack() got converted to use an explicit log level, show_stack_loglvl() can drop it's redundant suffix and become once again well known show_stack(). Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/alpha/kernel/traps.c | 8 +------- arch/arc/kernel/stacktrace.c | 8 +------- arch/arm/kernel/traps.c | 8 +------- arch/arm64/kernel/traps.c | 8 +------- arch/c6x/kernel/traps.c | 7 +------ arch/csky/kernel/ptrace.c | 4 ++-- arch/csky/kernel/stacktrace.c | 9 +-------- arch/h8300/kernel/traps.c | 8 +------- arch/hexagon/kernel/traps.c | 8 +------- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/process.c | 11 ++--------- arch/m68k/kernel/traps.c | 11 +++-------- arch/microblaze/kernel/traps.c | 8 +------- arch/mips/kernel/traps.c | 8 +------- arch/nds32/kernel/traps.c | 8 +------- arch/nios2/kernel/traps.c | 12 +++--------- arch/openrisc/kernel/traps.c | 10 ++-------- arch/parisc/kernel/traps.c | 8 +------- arch/powerpc/kernel/process.c | 11 +++-------- arch/powerpc/kernel/stacktrace.c | 2 +- arch/riscv/kernel/stacktrace.c | 8 +------- arch/s390/kernel/dumpstack.c | 9 ++------- arch/sh/kernel/dumpstack.c | 8 +------- arch/sparc/kernel/process_32.c | 11 ++--------- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/traps_64.c | 8 +------- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/kernel/sysrq.c | 7 +------ arch/unicore32/kernel/traps.c | 7 +------ arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/dumpstack.c | 7 +------ arch/xtensa/kernel/traps.c | 10 ++-------- drivers/base/power/main.c | 2 +- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 5 ++--- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/locking/rtmutex-debug.c | 2 +- kernel/sched/core.c | 2 +- lib/dump_stack.c | 2 +- 39 files changed, 52 insertions(+), 205 deletions(-) diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 2402f1777f54ee..8383ccfaccdc23 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -144,8 +144,7 @@ dik_show_trace(unsigned long 
*sp, const char *loglvl) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long *stack; int i; @@ -174,11 +173,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, dik_show_trace(sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) { diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 24f9cd8a12c94f..feba91c9d969ca 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -228,17 +228,11 @@ noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { show_stacktrace(tsk, NULL, loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - /* Another API expected by schedular, shows up in "ps" as Wait Channel * Of course just returning schedule( ) would be pointless so unwind until * the function is not in schedular code diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 00455b5bbf8aeb..09faa0efe47b22 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -247,18 +247,12 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, } #endif -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3621868b2fcc81..3fd9e2731e0dd9 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -137,18 +137,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, put_task_stack(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index 4afbf48f1ce004..2b9121c755be17 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -374,7 +374,7 @@ static void show_trace(unsigned long *stack, unsigned long *endstack, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { unsigned long *p, *endstack; @@ -403,11 +403,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, endstack, loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, 
KERN_DEBUG); -} - int is_valid_bugaddr(unsigned long addr) { return __kernel_text_address(addr); diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 5a82230bddf988..bbd801f86eb57f 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -344,7 +344,7 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs) trace_sys_exit(regs, syscall_get_return_value(current, regs)); } -extern void show_stack(struct task_struct *task, unsigned long *stack); +extern void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl); void show_regs(struct pt_regs *fp) { unsigned long *sp; @@ -420,6 +420,6 @@ void show_regs(struct pt_regs *fp) } pr_cont("\n"); - show_stack(NULL, (unsigned long *)fp->regs[4]); + show_stack(NULL, (unsigned long *)fp->regs[4], KERN_INFO); return; } diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index ca135f13cc1383..16ae20a0af3421 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -95,19 +95,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, KERN_INFO); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 6362446563d6ab..5d8b969cd8f349 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -115,8 +115,7 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { unsigned long *stack, addr; int i; @@ -158,8 +157,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, } printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_INFO); -} diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index a8a3a210d7810c..904134b37232f3 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -175,18 +175,12 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } } -void show_stack_loglvl(struct task_struct *task, unsigned long *fp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *fp, const char *loglvl) { /* Saved link reg is one word above FP */ do_show_stack(task, fp, 0, loglvl); } -void show_stack(struct task_struct *task, unsigned long *fp) -{ - show_stack_loglvl(task, fp, 0, KERN_INFO); -} - int die(const char *str, struct pt_regs *regs, long err) { static struct { diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6fb54dfa1350da..2703f7795672df 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1631,7 +1631,7 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi if (read_trylock(&tasklist_lock)) { do_each_thread (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); + show_stack(t, NULL, KERN_DEFAULT); } 
while_each_thread (g, t); read_unlock(&tasklist_lock); } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 913d9a01cbf9b3..96dfb9e4b16fbd 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -85,8 +85,7 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) } void -show_stack_loglvl (struct task_struct *task, unsigned long *sp, - const char *loglvl) +show_stack (struct task_struct *task, unsigned long *sp, const char *loglvl) { if (!task) unw_init_running(ia64_do_show_stack, (void *)loglvl); @@ -98,12 +97,6 @@ show_stack_loglvl (struct task_struct *task, unsigned long *sp, } } -void -show_stack (struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_regs (struct pt_regs *regs) { @@ -158,7 +151,7 @@ show_regs (struct pt_regs *regs) ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); } } else - show_stack(NULL, NULL); + show_stack(NULL, NULL, KERN_DEFAULT); } /* local support for deprecated console_print */ diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index ffcc5ec4fac3b4..df6fc782754f73 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -916,7 +916,7 @@ void show_registers(struct pt_regs *regs) default: pr_cont("\n"); } - show_stack(NULL, (unsigned long *)addr); + show_stack(NULL, (unsigned long *)addr, KERN_INFO); pr_info("Code:"); set_fs(KERNEL_DS); @@ -935,8 +935,8 @@ void show_registers(struct pt_regs *regs) pr_cont("\n"); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p; unsigned long *endstack; @@ -963,11 +963,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} - /* * The vector number returned in the frame pointer may also contain * the "fs" (Fault Status) bits on ColdFire. 
These are in the bottom diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 149ae534937efb..94b6fe93147d5a 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -31,8 +31,7 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long words_to_show; u32 fp = (u32) sp; @@ -77,8 +76,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, debug_show_held_locks(task); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e49040739b61d5..320797bd03f642 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -198,8 +198,7 @@ static void show_stacktrace(struct task_struct *task, show_backtrace(task, regs, loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { struct pt_regs regs; mm_segment_t old_fs = get_fs(); @@ -227,11 +226,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, set_fs(old_fs); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT) -} - static void show_code(unsigned int __user *pc) { long i; diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 90f12582c218a7..6a9772ba739276 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -135,8 +135,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long *base_reg; @@ -157,11 +156,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_EMERG); -} - DEFINE_SPINLOCK(die_lock); /* diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 08071caa9b36fa..b172da4eb1a952 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -52,14 +52,13 @@ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) } /* - * The show_stack(), show_stack_loglvl() are external API - * which we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. 
*/ int kstack_depth_to_print = 48; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *endstack, addr; int i; @@ -106,11 +105,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_EMERG); -} - void __init trap_init(void) { /* Nothing to do here */ diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3b7978a22d6854..3022b0ad142cc1 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -48,8 +48,7 @@ void print_trace(void *data, unsigned long addr, int reliable) } /* displays a short stack trace */ -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { if (esp == NULL) esp = (unsigned long *)&esp; @@ -58,11 +57,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, unwind_stack((void *)loglvl, esp, print_trace); } -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_EMERG); -} - void show_registers(struct pt_regs *regs) { int i; @@ -104,7 +98,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("\nStack: "); - show_stack(NULL, (unsigned long *)esp); + show_stack(NULL, (unsigned long *)esp, KERN_EMERG); printk("\nCode: "); if (regs->pc < PAGE_OFFSET) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index c2411de3730f2c..0a89899f154a58 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -198,17 +198,11 @@ static void parisc_show_stack(struct task_struct *task, do_show_stack(&info, loglvl); } -void show_stack_loglvl(struct task_struct *t, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *t, unsigned long *sp, const char *loglvl) { parisc_show_stack(t, NULL, loglvl); } -void show_stack(struct task_struct *t, unsigned long *sp) -{ - show_stack_loglvl(t, sp, KERN_CRIT) -} - int is_valid_bugaddr(unsigned long iaoq) { return 1; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a456b4454b3fd0..99d619f81cb527 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1456,7 +1456,7 @@ void show_regs(struct pt_regs * regs) printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); #endif - show_stack(current, (unsigned long *) regs->gpr[1]); + show_stack(current, (unsigned long *) regs->gpr[1], KERN_DEFAULT); if (!user_mode(regs)) show_instructions(regs); } @@ -2063,8 +2063,8 @@ unsigned long get_wchan(struct task_struct *p) static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; @@ -2133,11 +2133,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, put_task_stack(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *stack) -{ - show_stack_loglvl(tsk, stack, KERN_DEFAULT); -} - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void notrace __ppc64_runlatch_on(void) diff --git 
a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index c477b8585a297a..b6440657ef92d0 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -260,7 +260,7 @@ static void raise_backtrace_ipi(cpumask_t *mask) pr_cont(" current pointer corrupt? (%px)\n", p->__current); pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly stale):\n", p->saved_r1); - show_stack(p->__current, (unsigned long *)p->saved_r1); + show_stack(p->__current, (unsigned long *)p->saved_r1, KERN_WARNING); } } diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index aaa64bf007f820..595342910c3f64 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -105,18 +105,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 887a054919fc1e..0dc4b258b98d51 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -126,7 +126,7 @@ int get_stack_info(unsigned long sp, struct task_struct *task, return -EINVAL; } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct unwind_state state; @@ -139,11 +139,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, debug_show_held_locks(task ? 
: current); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_DEFAULT); -} - static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); @@ -181,7 +176,7 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); show_last_breaking_event(regs); } diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index cc51e9d7466748..a13c045804ed10 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -144,8 +144,7 @@ void show_trace(struct task_struct *tsk, unsigned long *sp, debug_show_held_locks(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long stack; @@ -161,8 +160,3 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL, loglvl); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 0b07de5618e50b..65c0d5207b0cd9 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -145,12 +145,10 @@ void show_regs(struct pt_regs *r) } /* - * The show_stack(), show_stack_loglvl() are external APIs which - * we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. * The oops is printed in die_if_kernel. */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long pc, fp; unsigned long task_base; @@ -179,11 +177,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - /* * Free current thread data structures etc.. 
*/ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 423011e6098287..5a4d9c8f890336 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -195,7 +195,7 @@ void show_regs(struct pt_regs *regs) regs->u_regs[15]); printk("RPC: <%pS>\n", (void *) regs->u_regs[15]); show_regwindow(regs); - show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]); + show_stack(current, (unsigned long *)regs->u_regs[UREG_FP], KERN_DEFAULT); } union global_cpu_snapshot global_cpu_snapshot[NR_CPUS]; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 8715bc93bd9d83..96d92f1075514e 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2453,8 +2453,7 @@ static void user_instruction_dump(unsigned int __user *pc) printk("\n"); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long fp, ksp; struct thread_info *tp; @@ -2514,11 +2513,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, } while (++count < 16); } -void show_stack(struct task_struct *tsk, unsigned long *_ksp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 30575bd9297525..a2e680f7d39f25 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -648,7 +648,7 @@ static void stack_proc(void *arg) { struct task_struct *task = arg; - show_stack(task, NULL); + show_stack(task, NULL, KERN_INFO); } /* diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 1b54b6431499b5..acbc879d277336 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -27,7 +27,7 @@ static const struct stacktrace_ops stackops = { .address = _print_addr }; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct pt_regs *segv_regs = current->thread.segv_regs; @@ -56,8 +56,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, dump_trace(current, &stackops, (void *)loglvl); printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 8b1335997f5025..a3ac01df1a2e43 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -167,18 +167,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, c_backtrace(fp, loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT) -} - static int __die(const char *str, int err, struct thread_info *thread, struct pt_regs *regs) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 9d2c076be37a66..5f816861f5d203 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack_loglvl(NULL, NULL, KERN_ERR); + show_stack(NULL, NULL, KERN_ERR); 
debug_dma_dump_mappings(NULL); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4396f2cfad1972..456511b2284eab 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, } } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { task = task ? : current; @@ -294,11 +294,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace_log_lvl(task, NULL, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_stack_regs(struct pt_regs *regs) { show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 1013acc2e03ea1..e880460741d2ba 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -501,8 +501,7 @@ static void show_trace(struct task_struct *task, unsigned long *sp, #define STACK_DUMP_LINE_SIZE 32 static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { size_t len; @@ -519,11 +518,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace(task, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} - DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) @@ -540,7 +534,7 @@ void die(const char * str, struct pt_regs * regs, long err) pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr); show_regs(regs); if (!user_mode(regs)) - show_stack(NULL, (unsigned long*)regs->areg[1]); + show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irq(&die_lock); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index be5af89bbff16a..9dd85bea40260a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -519,7 +519,7 @@ static void dpm_watchdog_handler(struct timer_list *t) struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); - show_stack_loglvl(wd->tsk, NULL, KERN_EMERG); + show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 7bd935379dece4..7c95afa905a083 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack_loglvl(NULL, NULL, KERN_INFO); + show_stack(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 373e4e3faf2ad9..00c45a0e6abe12 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -30,9 +30,8 @@ extern void show_regs(struct pt_regs *); * task), SP is the stack pointer of the first frame that should be shown in the back * trace (or NULL if the entire call-chain of the task should be shown). 
*/ -extern void show_stack(struct task_struct *task, unsigned long *sp); -extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl); +extern void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 43f5dcd2b9ac7e..18e03aba2cfc7d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -30,7 +30,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) kdb_dump_stack_on_cpu(kdb_process_cpu(p)); console_loglevel = old_lvl; } else { - show_stack_loglvl(p, addr, KERN_EMERG); + show_stack(p, addr, KERN_EMERG); } kdb_trap_printk--; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 5e63d6e8a2230b..36e69100e8e062 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack_loglvl(task, NULL, KERN_DEFAULT); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c68a6e7b306feb..8f360326861ec8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack_loglvl(p, NULL, KERN_INFO); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 5595e8962cf6bd..a00ee6eedc7c3c 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -74,7 +74,7 @@ void show_regs_print_info(const char *log_lvl) static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); - show_stack_loglvl(NULL, NULL, KERN_DEFAULT); + show_stack(NULL, NULL, KERN_DEFAULT); } /** From 6c0bb2a972b8aa7cfb1a0895983f638b24ec3b18 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:53 +1000 Subject: [PATCH 078/241] mm/frontswap: mark various intentional data races There are a few information counters that are intentionally not protected against increment races, so just annotate them using the data_race() macro. 
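For readers new to the annotation (the KCSAN report that motivated it follows
below): data_race() only tells KCSAN that a plain, racy access is intentional;
it adds no atomicity or ordering whatsoever. A minimal sketch with an assumed
counter name:

	static u64 example_hits;	/* statistics only, never read back for logic */

	static inline void inc_example_hits(void)
	{
		/* losing an occasional increment here is acceptable */
		data_race(example_hits++);
	}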
BUG: KCSAN: data-race in __frontswap_store / __frontswap_store write to 0xffffffff8b7174d8 of 8 bytes by task 6396 on cpu 103: __frontswap_store+0x2d0/0x344 inc_frontswap_failed_stores at mm/frontswap.c:70 (inlined by) __frontswap_store at mm/frontswap.c:280 swap_writepage+0x83/0xf0 pageout+0x33e/0xae0 shrink_page_list+0x1f57/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffffffff8b7174d8 of 8 bytes by task 6405 on cpu 47: __frontswap_store+0x2b9/0x344 inc_frontswap_failed_stores at mm/frontswap.c:70 (inlined by) __frontswap_store at mm/frontswap.c:280 swap_writepage+0x83/0xf0 pageout+0x33e/0xae0 shrink_page_list+0x1f57/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Link: http://lkml.kernel.org/r/1581114499-5042-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/frontswap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index bfa3a339253ef9..5c8c66bbedde5c 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -61,16 +61,16 @@ static u64 frontswap_failed_stores; static u64 frontswap_invalidates; static inline void inc_frontswap_loads(void) { - frontswap_loads++; + data_race(frontswap_loads++); } static inline void inc_frontswap_succ_stores(void) { - frontswap_succ_stores++; + data_race(frontswap_succ_stores++); } static inline void inc_frontswap_failed_stores(void) { - frontswap_failed_stores++; + data_race(frontswap_failed_stores++); } static inline void inc_frontswap_invalidates(void) { - frontswap_invalidates++; + data_race(frontswap_invalidates++); } #else static inline void inc_frontswap_loads(void) { } From 64bc91537c21b8e1e6278b89e6f1016d7180d3f5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:53 +1000 Subject: [PATCH 079/241] mm/page_io: mark various intentional data races struct swap_info_struct si.flags could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in scan_swap_map_slots / swap_readpage write to 0xffff9c77b80ac400 of 8 bytes by task 91325 on cpu 16: scan_swap_map_slots+0x6fe/0xb50 scan_swap_map_slots at mm/swapfile.c:887 get_swap_pages+0x39d/0x5c0 get_swap_page+0x377/0x524 add_to_swap+0xe4/0x1c0 shrink_page_list+0x1740/0x2820 shrink_inactive_list+0x316/0x8b0 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffff9c77b80ac400 of 8 bytes by task 5422 on cpu 7: swap_readpage+0x204/0x6a0 swap_readpage at mm/page_io.c:380 read_swap_cache_async+0xa2/0xb0 swapin_readahead+0x6a0/0x890 
do_swap_page+0x465/0xeb0 __handle_mm_fault+0xc7a/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Reported by Kernel Concurrency Sanitizer on: CPU: 7 PID: 5422 Comm: gmain Tainted: G W O L 5.5.0-next-20200204+ #6 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 Other reads, read to 0xffff91ea33eac400 of 8 bytes by task 11276 on cpu 120: __swap_writepage+0x140/0xc20 __swap_writepage at mm/page_io.c:289 read to 0xffff91ea33eac400 of 8 bytes by task 11264 on cpu 16: swap_set_page_dirty+0x44/0x1f4 swap_set_page_dirty at mm/page_io.c:442 The write is under &si->lock, but the reads are done as lockless. Since the reads only check for a specific bit in the flag, it is harmless even if load tearing happens. Thus, just mark them as intentional data races using the data_race() macro. Link: http://lkml.kernel.org/r/20200207003601.1526-1-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 76965be1d40ea5..1ee5957deb8835 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -286,7 +286,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FS) { + if (data_race(sis->flags & SWP_FS)) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -377,7 +377,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FS) { + if (data_race(sis->flags & SWP_FS)) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -439,7 +439,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FS) { + if (data_race(sis->flags & SWP_FS)) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); From 077b02e80bf7f1927f44f753c5c740e58280048d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:54 +1000 Subject: [PATCH 080/241] mm-page_io-mark-various-intentional-data-races-v2 add a missing annotation Link: http://lkml.kernel.org/r/1581612585-5812-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index 1ee5957deb8835..26935db0676c42 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -86,7 +86,7 @@ static void swap_slot_free_notify(struct page *page) return; sis = page_swap_info(page); - if (!(sis->flags & SWP_BLKDEV)) + if (data_race(!(sis->flags & SWP_BLKDEV))) return; /* From 1401fcdac259aed97111cdeb000fd17410df60df Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:55 +1000 Subject: [PATCH 081/241] mm/swap_state: mark various intentional data races swap_cache_info.* could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in lookup_swap_cache / lookup_swap_cache write to 0xffffffff85517318 of 8 bytes by task 94138 on cpu 101: lookup_swap_cache+0x12e/0x460 lookup_swap_cache at mm/swap_state.c:322 do_swap_page+0x112/0xeb0 __handle_mm_fault+0xc7a/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffffffff85517318 of 8 bytes by 
task 91655 on cpu 100: lookup_swap_cache+0x117/0x460 lookup_swap_cache at mm/swap_state.c:322 shmem_swapin_page+0xc7/0x9e0 shmem_getpage_gfp+0x2ca/0x16c0 shmem_fault+0xef/0x3c0 __do_fault+0x9e/0x220 do_fault+0x4a0/0x920 __handle_mm_fault+0xc69/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Reported by Kernel Concurrency Sanitizer on: CPU: 100 PID: 91655 Comm: systemd-journal Tainted: G W O L 5.5.0-next-20200204+ #6 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 write to 0xffffffff8d717308 of 8 bytes by task 11365 on cpu 87: __delete_from_swap_cache+0x681/0x8b0 __delete_from_swap_cache at mm/swap_state.c:178 read to 0xffffffff8d717308 of 8 bytes by task 11275 on cpu 53: __delete_from_swap_cache+0x66e/0x8b0 __delete_from_swap_cache at mm/swap_state.c:178 Both the read and the write are done locklessly. Since swap_cache_info.* are only used to print out counter information, it is harmless even if any of them miss a few increments due to data races, so just mark them as intentional data races using the data_race() macro. While at it, fix a checkpatch.pl warning, WARNING: Single statement macros should not use a do {} while (0) loop Link: http://lkml.kernel.org/r/20200207003715.1578-1-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/swap_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 9d20b00627af43..26113835db8b17 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -58,8 +58,8 @@ static bool enable_vma_readahead __read_mostly = true; #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) -#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) -#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) +#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) +#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) static struct { unsigned long add_total; From 2e89f20974cec6f4a8651434e03af5855c5ae95e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 4 Jun 2020 11:45:55 +1000 Subject: [PATCH 082/241] mm/filemap.c: fix a data race in filemap_fault() struct file_ra_state ra.mmap_miss could be accessed concurrently during page faults as noticed by KCSAN, BUG: KCSAN: data-race in filemap_fault / filemap_map_pages write to 0xffff9b1700a2c1b4 of 4 bytes by task 3292 on cpu 30: filemap_fault+0x920/0xfc0 do_sync_mmap_readahead at mm/filemap.c:2384 (inlined by) filemap_fault at mm/filemap.c:2486 __xfs_filemap_fault+0x112/0x3e0 [xfs] xfs_filemap_fault+0x74/0x90 [xfs] __do_fault+0x9e/0x220 do_fault+0x4a0/0x920 __handle_mm_fault+0xc69/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffff9b1700a2c1b4 of 4 bytes by task 3313 on cpu 32: filemap_map_pages+0xc2e/0xd80 filemap_map_pages at mm/filemap.c:2625 do_fault+0x3da/0x920 __handle_mm_fault+0xc69/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Reported by Kernel Concurrency Sanitizer on: CPU: 32 PID: 3313 Comm: systemd-udevd Tainted: G W L 5.5.0-next-20200210+ #1 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 ra.mmap_miss contributes to the readahead decisions, so a data race could be undesirable. Both the read and the write happen only under the non-exclusive mmap_sem, so two concurrent writers could even underflow the counter.
Fix the underflow by writing to a local variable before committing a final store to ra.mmap_miss given a small inaccuracy of the counter should be acceptable. Link: http://lkml.kernel.org/r/20200211030134.1847-1-cai@lca.pw Signed-off-by: Kirill A. Shutemov Signed-off-by: Qian Cai Tested-by: Qian Cai Reviewed-by: Matthew Wilcox (Oracle) Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/filemap.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c3629d76d6fcba..027b29bae09b2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2403,6 +2403,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file *fpin = NULL; pgoff_t offset = vmf->pgoff; + unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ) @@ -2418,14 +2419,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } /* Avoid banging the cache line if not needed */ - if (ra->mmap_miss < MMAP_LOTSAMISS * 10) - ra->mmap_miss++; + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss < MMAP_LOTSAMISS * 10) + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) + if (mmap_miss > MMAP_LOTSAMISS) return fpin; /* @@ -2451,13 +2453,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; struct file *fpin = NULL; + unsigned int mmap_miss; pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; - if (ra->mmap_miss > 0) - ra->mmap_miss--; + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); if (PageReadahead(page)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, @@ -2623,6 +2627,7 @@ void filemap_map_pages(struct vm_fault *vmf, unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *page; + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2659,8 +2664,8 @@ void filemap_map_pages(struct vm_fault *vmf, if (page->index >= max_idx) goto unlock; - if (file->f_ra.mmap_miss > 0) - file->f_ra.mmap_miss--; + if (mmap_miss > 0) + mmap_miss--; vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) @@ -2680,6 +2685,7 @@ void filemap_map_pages(struct vm_fault *vmf, break; } rcu_read_unlock(); + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); } EXPORT_SYMBOL(filemap_map_pages); From 0b44c55471b185768f4e4c6cae81058a1fc57755 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:56 +1000 Subject: [PATCH 083/241] mm/swapfile: fix and annotate various data races swap_info_struct si.highest_bit, si.swap_map[offset] and si.flags could be accessed concurrently separately as noticed by KCSAN, === si.highest_bit === write to 0xffff8d5abccdc4d4 of 4 bytes by task 5353 on cpu 24: swap_range_alloc+0x81/0x130 swap_range_alloc at mm/swapfile.c:681 scan_swap_map_slots+0x371/0xb90 get_swap_pages+0x39d/0x5c0 get_swap_page+0xf2/0x524 add_to_swap+0xe4/0x1c0 shrink_page_list+0x1795/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 
do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 read to 0xffff8d5abccdc4d4 of 4 bytes by task 6672 on cpu 70: scan_swap_map_slots+0x4a6/0xb90 scan_swap_map_slots at mm/swapfile.c:892 get_swap_pages+0x39d/0x5c0 get_swap_page+0xf2/0x524 add_to_swap+0xe4/0x1c0 shrink_page_list+0x1795/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 Reported by Kernel Concurrency Sanitizer on: CPU: 70 PID: 6672 Comm: oom01 Tainted: G W L 5.5.0-next-20200205+ #3 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 === si.swap_map[offset] === write to 0xffffbc370c29a64c of 1 bytes by task 6856 on cpu 86: __swap_entry_free_locked+0x8c/0x100 __swap_entry_free_locked at mm/swapfile.c:1209 (discriminator 4) __swap_entry_free.constprop.20+0x69/0xb0 free_swap_and_cache+0x53/0xa0 unmap_page_range+0x7f8/0x1d70 unmap_single_vma+0xcd/0x170 unmap_vmas+0x18b/0x220 exit_mmap+0xee/0x220 mmput+0x10e/0x270 do_exit+0x59b/0xf40 do_group_exit+0x8b/0x180 read to 0xffffbc370c29a64c of 1 bytes by task 6855 on cpu 20: _swap_info_get+0x81/0xa0 _swap_info_get at mm/swapfile.c:1140 free_swap_and_cache+0x40/0xa0 unmap_page_range+0x7f8/0x1d70 unmap_single_vma+0xcd/0x170 unmap_vmas+0x18b/0x220 exit_mmap+0xee/0x220 mmput+0x10e/0x270 do_exit+0x59b/0xf40 do_group_exit+0x8b/0x180 === si.flags === write to 0xffff956c8fc6c400 of 8 bytes by task 6087 on cpu 23: scan_swap_map_slots+0x6fe/0xb50 scan_swap_map_slots at mm/swapfile.c:887 get_swap_pages+0x39d/0x5c0 get_swap_page+0x377/0x524 add_to_swap+0xe4/0x1c0 shrink_page_list+0x1795/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 read to 0xffff956c8fc6c400 of 8 bytes by task 6207 on cpu 63: _swap_info_get+0x41/0xa0 __swap_info_get at mm/swapfile.c:1114 put_swap_page+0x84/0x490 __remove_mapping+0x384/0x5f0 shrink_page_list+0xff1/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 The writes are under si->lock, but the reads are not. For si.highest_bit and si.swap_map[offset], a data race could trigger logic bugs, so fix them by using WRITE_ONCE() for the writes and READ_ONCE() for the reads, except for those isolated reads that only compare against zero, where a data race would cause no harm; annotate those as intentional data races using the data_race() macro. For si.flags, the readers are only interested in a single bit, so a data race there would cause no issue.
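The rules above can be condensed into a short sketch (hypothetical struct and field names, not the real swap_info_struct):

	/* Writer is serialized by a lock; readers run locklessly. */
	struct demo_info {
		spinlock_t	lock;
		unsigned long	highest;	/* stands in for si->highest_bit */
		unsigned char	*map;		/* stands in for si->swap_map */
	};

	static void demo_shrink(struct demo_info *si, unsigned long end)
	{
		spin_lock(&si->lock);
		WRITE_ONCE(si->highest, end);	/* pairs with lockless readers */
		spin_unlock(&si->lock);
	}

	static bool demo_scan(struct demo_info *si, unsigned long offset)
	{
		/* The value feeds a comparison: READ_ONCE() avoids load tearing. */
		if (offset > READ_ONCE(si->highest))
			return false;
		/*
		 * Only tested against zero: even a torn load is harmless, so
		 * the race is merely documented for KCSAN with data_race().
		 */
		return data_race(!si->map[offset]);
	}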
Link: http://lkml.kernel.org/r/1581095163-12198-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/swapfile.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a3d191e205f2b9..c63875990093fe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -673,7 +673,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (offset == si->lowest_bit) si->lowest_bit += nr_entries; if (end == si->highest_bit) - si->highest_bit -= nr_entries; + WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); si->inuse_pages += nr_entries; if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; @@ -705,7 +705,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, if (end > si->highest_bit) { bool was_full = !si->highest_bit; - si->highest_bit = end; + WRITE_ONCE(si->highest_bit, end); if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } @@ -869,7 +869,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, else goto done; } - si->swap_map[offset] = usage; + WRITE_ONCE(si->swap_map[offset], usage); inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); @@ -928,12 +928,13 @@ static int scan_swap_map_slots(struct swap_info_struct *si, scan: spin_unlock(&si->lock); - while (++offset <= si->highest_bit) { - if (!si->swap_map[offset]) { + while (++offset <= READ_ONCE(si->highest_bit)) { + if (data_race(!si->swap_map[offset])) { spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full() && + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -945,11 +946,12 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } offset = si->lowest_bit; while (offset < scan_base) { - if (!si->swap_map[offset]) { + if (data_race(!si->swap_map[offset])) { spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full() && + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -1148,7 +1150,7 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) p = swp_swap_info(entry); if (!p) goto bad_nofile; - if (!(p->flags & SWP_USED)) + if (data_race(!(p->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= p->max) @@ -1174,7 +1176,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) p = __swap_info_get(entry); if (!p) goto out; - if (!p->swap_map[swp_offset(entry)]) + if (data_race(!p->swap_map[swp_offset(entry)])) goto bad_free; return p; @@ -1243,7 +1245,10 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, } usage = count | has_cache; - p->swap_map[offset] = usage ? 
: SWAP_HAS_CACHE; + if (usage) + WRITE_ONCE(p->swap_map[offset], usage); + else + WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); return usage; } @@ -1295,7 +1300,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) goto bad_nofile; rcu_read_lock(); - if (!(si->flags & SWP_VALID)) + if (data_race(!(si->flags & SWP_VALID))) goto unlock_out; offset = swp_offset(entry); if (offset >= si->max) @@ -3483,7 +3488,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) } else err = -ENOENT; /* unused swap entry */ - p->swap_map[offset] = count | has_cache; + WRITE_ONCE(p->swap_map[offset], count | has_cache); unlock_out: unlock_cluster_or_swap_info(p, ci); From e6ca08f8969682d6529bdf021b7a0d68a5c094e6 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:57 +1000 Subject: [PATCH 084/241] mm-swapfile-fix-and-annotate-various-data-races-v2 add a missing annotation for si->flags in memory.c Link: http://lkml.kernel.org/r/1581612647-5958-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 924c2b7a41768d..58a74acafcd0ac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3124,8 +3124,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!page) { struct swap_info_struct *si = swp_swap_info(entry); - if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(entry) == 1) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); From c1f8a715c2e92b031c164c3dff819461d6be2918 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:57 +1000 Subject: [PATCH 085/241] mm/page_counter: fix various data races at memsw Since commit 3e32cb2e0a12 ("mm: memcontrol: lockless page counters"), memcg->memsw->watermark and memcg->memsw->failcnt could be accessed concurrently, as reported by KCSAN, BUG: KCSAN: data-race in page_counter_try_charge / page_counter_try_charge read to 0xffff8fb18c4cd190 of 8 bytes by task 1081 on cpu 59: page_counter_try_charge+0x4d/0x150 mm/page_counter.c:138 try_charge+0x131/0xd50 mm/memcontrol.c:2405 __memcg_kmem_charge_memcg+0x58/0x140 __memcg_kmem_charge+0xcc/0x280 __alloc_pages_nodemask+0x1e1/0x450 alloc_pages_current+0xa6/0x120 pte_alloc_one+0x17/0xd0 __pte_alloc+0x3a/0x1f0 copy_p4d_range+0xc36/0x1990 copy_page_range+0x21d/0x360 dup_mmap+0x5f5/0x7a0 dup_mm+0xa2/0x240 copy_process+0x1b3f/0x3460 _do_fork+0xaa/0xa20 __x64_sys_clone+0x13b/0x170 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe write to 0xffff8fb18c4cd190 of 8 bytes by task 1153 on cpu 120: page_counter_try_charge+0x5b/0x150 mm/page_counter.c:139 try_charge+0x131/0xd50 mm/memcontrol.c:2405 mem_cgroup_try_charge+0x159/0x460 mem_cgroup_try_charge_delay+0x3d/0xa0 wp_page_copy+0x14d/0x930 do_wp_page+0x107/0x7b0 __handle_mm_fault+0xce6/0xd40 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 BUG: KCSAN: data-race in page_counter_try_charge / page_counter_try_charge write to 0xffff88809bbf2158 of 8 bytes by task 11782 on cpu 0: page_counter_try_charge+0x100/0x170 mm/page_counter.c:129 try_charge+0x185/0xbf0 mm/memcontrol.c:2405 __memcg_kmem_charge_memcg+0x4a/0xe0 mm/memcontrol.c:2837 __memcg_kmem_charge+0xcf/0x1b0 mm/memcontrol.c:2877 __alloc_pages_nodemask+0x26c/0x310 mm/page_alloc.c:4780 read to 0xffff88809bbf2158 of 8 bytes by task
11814 on cpu 1: page_counter_try_charge+0xef/0x170 mm/page_counter.c:129 try_charge+0x185/0xbf0 mm/memcontrol.c:2405 __memcg_kmem_charge_memcg+0x4a/0xe0 mm/memcontrol.c:2837 __memcg_kmem_charge+0xcf/0x1b0 mm/memcontrol.c:2877 __alloc_pages_nodemask+0x26c/0x310 mm/page_alloc.c:4780 Since the watermark could be compared against or set to garbage due to a data race, which would change the code logic, fix it by adding a pair of READ_ONCE() and WRITE_ONCE() in those places. The "failcnt" counter is tolerant of some degree of inaccuracy and is only used to report stats, so a data race will not be harmful; thus, mark it as an intentional data race using the data_race() macro. Link: http://lkml.kernel.org/r/1581519682-23594-1-git-send-email-cai@lca.pw Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") Signed-off-by: Qian Cai Reported-by: syzbot+f36cfe60b1006a94f9dc@syzkaller.appspotmail.com Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Tetsuo Handa Cc: Marco Elver Cc: Dmitry Vyukov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/page_counter.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index c56db2d5e15924..e3a205275a0b97 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -77,8 +77,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) * This is indeed racy, but we can live with some * inaccuracy in the watermark. */ - if (new > c->watermark) - c->watermark = new; + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); } } @@ -119,9 +119,10 @@ bool page_counter_try_charge(struct page_counter *counter, propagate_protected_usage(counter, new); /* * This is racy, but we can live with some - * inaccuracy in the failcnt. + * inaccuracy in the failcnt which is only used + * to report stats. */ - c->failcnt++; + data_race(c->failcnt++); *fail = c; goto failed; } @@ -130,8 +131,8 @@ bool page_counter_try_charge(struct page_counter *counter, * Just like with failcnt, we can live with some * inaccuracy in the watermark.
*/ - if (new > c->watermark) - c->watermark = new; + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); } return true; From b79031ebe25c032fa73b84889e319eb4a0791c0c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:57 +1000 Subject: [PATCH 086/241] mm/memcontrol: fix a data race in scan count struct mem_cgroup_per_node mz.lru_zone_size[zone_idx][lru] could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in lruvec_lru_size / mem_cgroup_update_lru_size write to 0xffff9c804ca285f8 of 8 bytes by task 50951 on cpu 12: mem_cgroup_update_lru_size+0x11c/0x1d0 mem_cgroup_update_lru_size at mm/memcontrol.c:1266 isolate_lru_pages+0x6a9/0xf30 shrink_active_list+0x123/0xcc0 shrink_lruvec+0x8fd/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffff9c804ca285f8 of 8 bytes by task 50964 on cpu 95: lruvec_lru_size+0xbb/0x270 mem_cgroup_get_zone_lru_size at include/linux/memcontrol.h:536 (inlined by) lruvec_lru_size at mm/vmscan.c:326 shrink_lruvec+0x1d0/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_current+0xa6/0x120 alloc_slab_page+0x3b1/0x540 allocate_slab+0x70/0x660 new_slab+0x46/0x70 ___slab_alloc+0x4ad/0x7d0 __slab_alloc+0x43/0x70 kmem_cache_alloc+0x2c3/0x420 getname_flags+0x4c/0x230 getname+0x22/0x30 do_sys_openat2+0x205/0x3b0 do_sys_open+0x9a/0xf0 __x64_sys_openat+0x62/0x80 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reported by Kernel Concurrency Sanitizer on: CPU: 95 PID: 50964 Comm: cc1 Tainted: G W O L 5.5.0-next-20200204+ #6 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 The write is under lru_lock, but the read is done locklessly. The scan count is used to determine how aggressively the anon and file LRU lists should be scanned. Load tearing could generate an inefficient heuristic, so fix it by adding READ_ONCE() for the read.
Link: http://lkml.kernel.org/r/20200206034945.2481-1-cai@lca.pw Signed-off-by: Qian Cai Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e77197a62809f3..bbf624a7f5a616 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -520,7 +520,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_zone_size[zone_idx][lru]; + return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(void); From a14616fc09ca1e2787e3de44102bb0fc3d021a43 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:58 +1000 Subject: [PATCH 087/241] mm/list_lru: fix a data race in list_lru_count_one struct list_lru_one l.nr_items could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in list_lru_count_one / list_lru_isolate_move write to 0xffffa102789c4510 of 8 bytes by task 823 on cpu 39: list_lru_isolate_move+0xf9/0x130 list_lru_isolate_move at mm/list_lru.c:180 inode_lru_isolate+0x12b/0x2a0 __list_lru_walk_one+0x122/0x3d0 list_lru_walk_one+0x75/0xa0 prune_icache_sb+0x8b/0xc0 super_cache_scan+0x1b8/0x250 do_shrink_slab+0x256/0x6d0 shrink_slab+0x41b/0x4a0 shrink_node+0x35c/0xd80 balance_pgdat+0x652/0xd90 kswapd+0x396/0x8d0 kthread+0x1e0/0x200 ret_from_fork+0x27/0x50 read to 0xffffa102789c4510 of 8 bytes by task 6345 on cpu 56: list_lru_count_one+0x116/0x2f0 list_lru_count_one at mm/list_lru.c:193 super_cache_count+0xe8/0x170 do_shrink_slab+0x95/0x6d0 shrink_slab+0x41b/0x4a0 shrink_node+0x35c/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Reported by Kernel Concurrency Sanitizer on: CPU: 56 PID: 6345 Comm: oom01 Tainted: G W L 5.5.0-next-20200205+ #4 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 A shattered l.nr_items could affect the shrinker behaviour due to a data race. Fix it by adding READ_ONCE() for the read. Since the writes are aligned and up to word-size, assume those are safe from data races to avoid readability issues of writing WRITE_ONCE(var, var + val). 
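The resulting asymmetric pattern, sketched here with hypothetical names, annotates only the lockless reader; the aligned, word-sized writes stay plain because they are already serialized by the lock:

	struct demo_lru {
		spinlock_t	lock;
		long		nr_items;
	};

	static void demo_isolate(struct demo_lru *l)
	{
		lockdep_assert_held(&l->lock);
		l->nr_items--;		/* serialized write; left unannotated */
	}

	static unsigned long demo_count(struct demo_lru *l)
	{
		/* Lockless read: READ_ONCE() prevents tearing and refetching. */
		return READ_ONCE(l->nr_items);
	}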
Link: http://lkml.kernel.org/r/1581114679-5488-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/list_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 9222910ab1cb79..dbff67afe19734 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -180,7 +180,7 @@ unsigned long list_lru_count_one(struct list_lru *lru, rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); - count = l->nr_items; + count = READ_ONCE(l->nr_items); rcu_read_unlock(); return count; From e1e2bcce11f9ec273ce29fd770e4826b90611e6b Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:59 +1000 Subject: [PATCH 088/241] mm/mempool: fix a data race in mempool_free() mempool_t pool.curr_nr could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in mempool_free / remove_element write to 0xffffffffa937638c of 4 bytes by task 6359 on cpu 113: remove_element+0x4a/0x1c0 remove_element at mm/mempool.c:132 mempool_alloc+0x102/0x210 (inlined by) mempool_alloc at mm/mempool.c:399 bio_alloc_bioset+0x106/0x2c0 get_swap_bio+0x49/0x230 __swap_writepage+0x680/0xc30 swap_writepage+0x9c/0xf0 pageout+0x33e/0xae0 shrink_page_list+0x1f57/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 do_try_to_free_pages+0x1f7/0xa10 try_to_free_pages+0x26c/0x5e0 __alloc_pages_slowpath+0x458/0x1290 read to 0xffffffffa937638c of 4 bytes by interrupt on cpu 64: mempool_free+0x3e/0x150 mempool_free at mm/mempool.c:492 bio_free+0x192/0x280 bio_put+0x91/0xd0 end_swap_bio_write+0x1d8/0x280 bio_endio+0x2c2/0x5b0 dec_pending+0x22b/0x440 [dm_mod] clone_endio+0xe4/0x2c0 [dm_mod] bio_endio+0x2c2/0x5b0 blk_update_request+0x217/0x940 scsi_end_request+0x6b/0x4d0 scsi_io_completion+0xb7/0x7e0 scsi_finish_command+0x223/0x310 scsi_softirq_done+0x1d5/0x210 blk_mq_complete_request+0x224/0x250 scsi_mq_done+0xc2/0x250 pqi_raid_io_complete+0x5a/0x70 [smartpqi] pqi_irq_handler+0x150/0x1410 [smartpqi] __handle_irq_event_percpu+0x90/0x540 handle_irq_event_percpu+0x49/0xd0 handle_irq_event+0x85/0xca handle_edge_irq+0x13f/0x3e0 do_IRQ+0x86/0x190 The write is under pool->lock, but the read is done locklessly. Even though commit 5b990546e334 ("mempool: fix and document synchronization and memory barrier usage") introduced the smp_wmb() and smp_rmb() pair to improve the situation, that is not adequate to protect it from data races which could lead to a logic bug, so fix it by adding READ_ONCE() for the read. Link: http://lkml.kernel.org/r/1581446384-2131-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Cc: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index 85efab3da72044..79bff63ecf2748 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -489,7 +489,7 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters.
*/ - if (unlikely(pool->curr_nr < pool->min_nr)) { + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); From ec561d5e2dfd6136742952c91f3ff682626d26ec Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:45:59 +1000 Subject: [PATCH 089/241] mm/util.c: annotate a data race at vm_committed_as "vm_committed_as.count" could be accessed concurrently as reported by KCSAN, read to 0xffffffff923164f8 of 8 bytes by task 1268 on cpu 38: __vm_enough_memory+0x43/0x280 mm/util.c:801 mmap_region+0x1b2/0xb90 mm/mmap.c:1726 do_mmap+0x45c/0x700 vm_mmap_pgoff+0xc0/0x130 vm_mmap+0x71/0x90 elf_map+0xa1/0x1b0 load_elf_binary+0x9de/0x2180 search_binary_handler+0xd8/0x2b0 __do_execve_file+0xb61/0x1080 __x64_sys_execve+0x5f/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe write to 0xffffffff923164f8 of 8 bytes by task 1265 on cpu 41: percpu_counter_add_batch+0x83/0xd0 lib/percpu_counter.c:91 exit_mmap+0x178/0x220 include/linux/mman.h:68 mmput+0x10e/0x270 flush_old_exec+0x572/0xfe0 load_elf_binary+0x467/0x2180 search_binary_handler+0xd8/0x2b0 __do_execve_file+0xb61/0x1080 __x64_sys_execve+0x5f/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe The warning is almost impossible to trigger, according to commit 82f71ae4a2b8 ("mm: catch memory commitment underflow"), but leave it in for now to catch any possible unbalanced vm_unacct_memory() in the future. Since only the read operates locklessly, mark it as an intentional data race using the data_race() macro to avoid modifying percpu_counter_read() and still catch unintended races elsewhere. Link: http://lkml.kernel.org/r/1581518109-21180-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Christoph Lameter Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/util.c b/mm/util.c index db059539de24ec..4ff20121f07bc2 100644 --- a/mm/util.c +++ b/mm/util.c @@ -814,8 +814,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), + /* + * A transient decrease in the value is unlikely, so no need + * READ_ONCE() for vm_committed_as.count.
+ */ + VM_WARN_ONCE(data_race(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus()), "memory commitment underflow"); vm_acct_memory(pages); From 6ed810960de8d0f1d23fa62bb279b91cc5196dfe Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:46:00 +1000 Subject: [PATCH 090/241] mm/rmap: annotate a data race at tlb_flush_batched mm->tlb_flush_batched could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one write to 0xffff93f754880bd0 of 1 bytes by task 822 on cpu 6: try_to_unmap_one+0x59a/0x1ab0 set_tlb_ubc_flush_pending at mm/rmap.c:635 (inlined by) try_to_unmap_one at mm/rmap.c:1538 rmap_walk_anon+0x296/0x650 rmap_walk+0xdf/0x100 try_to_unmap+0x18a/0x2f0 shrink_page_list+0xef6/0x2870 shrink_inactive_list+0x316/0x880 shrink_lruvec+0x8dc/0x1380 shrink_node+0x317/0xd80 balance_pgdat+0x652/0xd90 kswapd+0x396/0x8d0 kthread+0x1e0/0x200 ret_from_fork+0x27/0x50 read to 0xffff93f754880bd0 of 1 bytes by task 6364 on cpu 4: flush_tlb_batched_pending+0x29/0x90 flush_tlb_batched_pending at mm/rmap.c:682 change_p4d_range+0x5dd/0x1030 change_pte_range at mm/mprotect.c:44 (inlined by) change_pmd_range at mm/mprotect.c:212 (inlined by) change_pud_range at mm/mprotect.c:240 (inlined by) change_p4d_range at mm/mprotect.c:260 change_protection+0x222/0x310 change_prot_numa+0x3e/0x60 task_numa_work+0x219/0x350 task_work_run+0xed/0x140 prepare_exit_to_usermode+0x2cc/0x2e0 ret_from_intr+0x32/0x42 Reported by Kernel Concurrency Sanitizer on: CPU: 4 PID: 6364 Comm: mtest01 Tainted: G W L 5.5.0-next-20200210+ #5 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 The read in flush_tlb_batched_pending() is under the PTL, but the write is not; however, mm->tlb_flush_batched is only a bool, so the value is unlikely to be torn. Thus, mark it as an intentional data race using the data_race() macro.
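As a sketch of this pattern (hypothetical flag name; flush_tlb_mm() is the only real API used here):

	static bool demo_flush_pending;	/* set under the page-table lock */

	static void demo_flush_if_pending(struct mm_struct *mm)
	{
		/*
		 * The writer holds the PTL, this reader does not.  A bool
		 * cannot be torn, and acting on a briefly stale value is
		 * tolerable, so the race is only documented for KCSAN.
		 */
		if (data_race(demo_flush_pending)) {
			flush_tlb_mm(mm);
			demo_flush_pending = false;
		}
	}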
Link: http://lkml.kernel.org/r/1581450783-8262-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index ad4a0fdcc94c39..bd98a995c5731e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -672,7 +672,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (mm->tlb_flush_batched) { + if (data_race(mm->tlb_flush_batched)) { flush_tlb_mm(mm); /* From 1c2e7a543d2f8173fc57d5c1ad583e197b688d9a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:46:00 +1000 Subject: [PATCH 091/241] mm: annotate a data race in page_zonenum() BUG: KCSAN: data-race in page_cpupid_xchg_last / put_page write (marked) to 0xfffffc0d48ec1a00 of 8 bytes by task 91442 on cpu 3: page_cpupid_xchg_last+0x51/0x80 page_cpupid_xchg_last at mm/mmzone.c:109 (discriminator 11) wp_page_reuse+0x3e/0xc0 wp_page_reuse at mm/memory.c:2453 do_wp_page+0x472/0x7b0 do_wp_page at mm/memory.c:2798 __handle_mm_fault+0xcb0/0xd00 handle_pte_fault at mm/memory.c:4049 (inlined by) __handle_mm_fault at mm/memory.c:4163 handle_mm_fault+0xfc/0x2f0 handle_mm_fault at mm/memory.c:4200 do_page_fault+0x263/0x6f9 do_user_addr_fault at arch/x86/mm/fault.c:1465 (inlined by) do_page_fault at arch/x86/mm/fault.c:1539 page_fault+0x34/0x40 read to 0xfffffc0d48ec1a00 of 8 bytes by task 94817 on cpu 69: put_page+0x15a/0x1f0 page_zonenum at include/linux/mm.h:923 (inlined by) is_zone_device_page at include/linux/mm.h:929 (inlined by) page_is_devmap_managed at include/linux/mm.h:948 (inlined by) put_page at include/linux/mm.h:1023 wp_page_copy+0x571/0x930 wp_page_copy at mm/memory.c:2615 do_wp_page+0x107/0x7b0 __handle_mm_fault+0xcb0/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 Reported by Kernel Concurrency Sanitizer on: CPU: 69 PID: 94817 Comm: systemd-udevd Tainted: G W O L 5.5.0-next-20200204+ #6 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 A page never changes its zone number. The zone number happens to be stored in the same word as other bits which are modified, but the zone number bits will never be modified by any other write, so it can accept a reload of the zone bits after an intervening write and does not need to use READ_ONCE(). Thus, annotate this data race using ASSERT_EXCLUSIVE_BITS() to also assert that there are no concurrent writes to it. Link: http://lkml.kernel.org/r/1581619089-14472-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Suggested-by: Marco Elver Cc: Paul E.
McKenney Cc: David Hildenbrand Cc: Jan Kara Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index fd14d870cc9c33..caa851d44bf344 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1056,6 +1056,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); static inline enum zone_type page_zonenum(const struct page *page) { + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } From 6ecf6519f56eb4bd53880c4c751057b3847bac5d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 4 Jun 2020 11:46:01 +1000 Subject: [PATCH 092/241] mm/swap.c: annotate data races for lru_rotate_pvecs Read to lru_add_pvec->nr could be interrupted and then write to the same variable. The write has local interrupt disabled, but the plain reads result in data races. However, it is unlikely the compilers could do much damage here given that lru_add_pvec->nr is a "unsigned char" and there is an existing compiler barrier. Thus, annotate the reads using the data_race() macro. The data races were reported by KCSAN, BUG: KCSAN: data-race in lru_add_drain_cpu / rotate_reclaimable_page write to 0xffff9291ebcb8a40 of 1 bytes by interrupt on cpu 23: rotate_reclaimable_page+0x2df/0x490 pagevec_add at include/linux/pagevec.h:81 (inlined by) rotate_reclaimable_page at mm/swap.c:259 end_page_writeback+0x1b5/0x2b0 end_swap_bio_write+0x1d0/0x280 bio_endio+0x297/0x560 dec_pending+0x218/0x430 [dm_mod] clone_endio+0xe4/0x2c0 [dm_mod] bio_endio+0x297/0x560 blk_update_request+0x201/0x920 scsi_end_request+0x6b/0x4a0 scsi_io_completion+0xb7/0x7e0 scsi_finish_command+0x1ed/0x2a0 scsi_softirq_done+0x1c9/0x1d0 blk_done_softirq+0x181/0x1d0 __do_softirq+0xd9/0x57c irq_exit+0xa2/0xc0 do_IRQ+0x8b/0x190 ret_from_intr+0x0/0x42 delay_tsc+0x46/0x80 __const_udelay+0x3c/0x40 __udelay+0x10/0x20 kcsan_setup_watchpoint+0x202/0x3a0 __tsan_read1+0xc2/0x100 lru_add_drain_cpu+0xb8/0x3f0 lru_add_drain+0x25/0x40 shrink_active_list+0xe1/0xc80 shrink_lruvec+0x766/0xb70 shrink_node+0x2d6/0xca0 do_try_to_free_pages+0x1f7/0x9a0 try_to_free_pages+0x252/0x5b0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x16e/0x6f0 __handle_mm_fault+0xcd5/0xd40 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 read to 0xffff9291ebcb8a40 of 1 bytes by task 37761 on cpu 23: lru_add_drain_cpu+0xb8/0x3f0 lru_add_drain_cpu at mm/swap.c:602 lru_add_drain+0x25/0x40 shrink_active_list+0xe1/0xc80 shrink_lruvec+0x766/0xb70 shrink_node+0x2d6/0xca0 do_try_to_free_pages+0x1f7/0x9a0 try_to_free_pages+0x252/0x5b0 __alloc_pages_slowpath+0x458/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x16e/0x6f0 __handle_mm_fault+0xcd5/0xd40 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40 2 locks held by oom02/37761: #0: ffff9281e5928808 (&mm->mmap_sem#2){++++}, at: do_page_fault #1: ffffffffb3ade380 (fs_reclaim){+.+.}, at: fs_reclaim_acquire.part irq event stamp: 1949217 trace_hardirqs_on_thunk+0x1a/0x1c __do_softirq+0x2e7/0x57c __do_softirq+0x34c/0x57c irq_exit+0xa2/0xc0 Reported by Kernel Concurrency Sanitizer on: CPU: 23 PID: 37761 Comm: oom02 Not tainted 5.6.0-rc3-next-20200226+ #6 Hardware name: HP ProLiant BL660c Gen9, BIOS I38 10/17/2018 Link: http://lkml.kernel.org/r/20200228044018.1263-1-cai@lca.pw 
Signed-off-by: Qian Cai Acked-by: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index dbcab84c6fcec0..667133d25e5641 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -632,7 +632,8 @@ void lru_add_drain_cpu(int cpu) __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate.pvec, cpu); - if (pagevec_count(pvec)) { + /* Disabling interrupts below acts as a compiler barrier. */ + if (data_race(pagevec_count(pvec))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ @@ -793,7 +794,7 @@ void lru_add_drain_all(void) struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || - pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) || + data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || From 8652fe7a67346a89f9e7876e822bbff2ac29a3ee Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 4 Jun 2020 11:46:02 +1000 Subject: [PATCH 093/241] mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/perf/callchain_64.c | 4 +--- include/linux/mm.h | 10 +++++++-- kernel/events/core.c | 4 ++-- mm/gup.c | 29 ++++++++++++++------------ virt/kvm/kvm_main.c | 8 +++---- 7 files changed, 32 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 7a2a93f013b95b..7c5a1812a1c31f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -581,7 +581,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, * We always ask for write permission since the common case * is that the page is writable. */ - if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index ebe823168ecd39..4b437a3f09d416 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -806,7 +806,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * is that the page is writable. 
*/ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { upgrade_write = true; } else { unsigned long pfn; diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index b63086b663ef26..9cc1a129737e87 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -30,11 +30,9 @@ int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned long addr = (unsigned long) ptr; unsigned long offset; struct page *page; - int nrpages; void *kaddr; - nrpages = __get_user_pages_fast(addr, 1, 1, &page); - if (nrpages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { kaddr = page_address(page); /* align address to page boundary */ diff --git a/include/linux/mm.h b/include/linux/mm.h index caa851d44bf344..fa8019ca2bbb8d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1825,10 +1825,16 @@ extern int mprotect_fixup(struct vm_area_struct *vma, /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a0856..63d66bbebbd504 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } diff --git a/mm/gup.c b/mm/gup.c index e19ff770eb4c87..9d1bf3ec8e394d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2294,7 +2294,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. 
*/ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2699,7 +2699,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2811,8 +2811,14 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -/* +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the @@ -2825,8 +2831,8 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { int nr_pinned; /* @@ -2836,10 +2842,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; - - if (write) - gup_flags |= FOLL_WRITE; + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -2855,7 +2858,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr_pinned; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2926,8 +2929,8 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* - * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the - * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. 
*/ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e165979..7b0da1c28e51d6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1740,7 +1740,6 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; - int npages; /* * Fast pin a writable pfn only if it is a write fault request @@ -1750,8 +1749,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, if (!(write_fault || writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); if (writable) @@ -1791,7 +1789,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (unlikely(!write_fault) && writable) { struct page *wpage; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { *writable = true; put_page(page); page = wpage; @@ -2003,7 +2001,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); From 2041654c516aabfc779bbcb61ea035741305daec Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:03 +1000 Subject: [PATCH 094/241] mm/gup: update pin_user_pages.rst for "case 3" (mmu notifiers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update case 3 so that it covers the use of mmu notifiers, for hardware that does, or does not have replayable page faults. Also, elaborate case 4 slightly, as it was quite cryptic. Link: http://lkml.kernel.org/r/20200527194953.11130-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/core-api/pin_user_pages.rst | 33 +++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86c6..4675b04e88290d 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,28 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). - -Therefore, neither flag needs to be set. - -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. 
+CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. + +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. + +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. page_maybe_dma_pinned(): the whole point of pinning =================================================== From 5c16174af774fb8d4e23fe6ea759117487b8d0cf Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:04 +1000 Subject: [PATCH 095/241] mm/gup: introduce pin_user_pages_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: introduce pin_user_pages_locked(), use it in frame_vector.c", v2. This adds yet one more pin_user_pages*() variant, and uses that to convert mm/frame_vector.c. With this, along with maybe 20 or 30 other recent patches in various trees, we are close to having the relevant gup call sites converted--with the notable exception of the bio/block layer. This patch (of 2): Introduce pin_user_pages_locked(), which is nearly identical to get_user_pages_locked() except that it sets FOLL_PIN and rejects FOLL_GET. As with other pairs of get_user_pages*() and pin_user_pages() API calls, it's prudent to assert that FOLL_PIN is *not* set in the get_user_pages*() call, so add that as part of this. 
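A hypothetical caller of the new API might look like the sketch below (error handling abbreviated; per the *locked protocol inherited from get_user_pages_locked(), the function returns with *locked cleared if it had to drop mmap_sem):

	static int demo_pin_range(unsigned long start, unsigned long nr_pages,
				  struct page **pages)
	{
		int locked = 1;
		long pinned;

		down_read(&current->mm->mmap_sem);
		pinned = pin_user_pages_locked(start, nr_pages, FOLL_WRITE,
					       pages, &locked);
		if (locked)
			up_read(&current->mm->mmap_sem);
		if (pinned < 0)
			return pinned;

		/* ... DMA or direct-IO access to the pinned pages ... */

		/* FOLL_PIN pages must be released via unpin_user_pages(). */
		unpin_user_pages(pages, pinned);
		return 0;
	}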
Link: http://lkml.kernel.org/r/20200531234131.770697-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/mm.h | 2 ++ mm/gup.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index fa8019ca2bbb8d..b1b1431e2b475a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1707,6 +1707,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 9d1bf3ec8e394d..86da8642a97f4d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3058,3 +3058,33 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); + From ba6f6bbb33bddf6a815a0ef1d7bb3aa702e41ce6 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:04 +1000 Subject: [PATCH 096/241] mm/gup: introduce pin_user_pages_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an assert-and-return to the corresponding get_user_pages_locked() call, to keep out any externally set FOLL_PIN flag. Link: http://lkml.kernel.org/r/20200531234131.770697-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Cc: Souptick Joarder Cc: Daniel Vetter Cc: Dave Chinner Cc: Jan Kara Cc: Jérôme Glisse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 86da8642a97f4d..680c8f4c49de29 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2035,6 +2035,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, From 87c432bf3a891f60f77554fe0b47c808282faf27 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:05 +1000 Subject: [PATCH 097/241] mm/gup: frame_vector: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), and all of the callers so far were in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages.
[1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Link: http://lkml.kernel.org/r/20200527223243.884385-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/frame_vector.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad551..4107dbca0056b1 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(start, nr_frames, + ret = pin_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames); */ void put_vaddr_frames(struct frame_vector *vec) { - int i; struct page **pages; if (!vec->got_ref) @@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec) */ if (WARN_ON(IS_ERR(pages))) goto out; - for (i = 0; i < vec->nr_frames; i++) - put_page(pages[i]); + + unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; From 280b8d61f93f5d63308e744a1c0cb05703c43e48 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:05 +1000 Subject: [PATCH 098/241] mm/gup: documentation fix for pin_user_pages*() APIs All of the pin_user_pages*() API calls will cause pages to be dma-pinned. As such, they are all suitable for DMA, RDMA, and/or Direct IO. The documentation should say so, but it was instead saying that three of the API calls were only suitable for Direct IO. This was discovered when a reviewer wondered why an API call that specifically recommended against Case 2 (DMA/RDMA) was being used in a DMA situation [1]. Fix this by simply deleting those claims. The gup.c comments already refer to the more extensive Documentation/core-api/pin_user_pages.rst, which does have the correct guidance. So let's just write it once, there. [1] https://lore.kernel.org/r/20200529074658.GM30374@kadam Link: http://lkml.kernel.org/r/20200529084515.46259-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Dan Carpenter Cc: Jan Kara Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/gup.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 680c8f4c49de29..5f9b4d294ef1d5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2918,9 +2918,6 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2994,9 +2991,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins).
*/ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -3030,9 +3024,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, From a65768dd7f35239cbe3632c3a15bc68b8fdc742e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:06 +1000 Subject: [PATCH 099/241] docs: mm/gup: pin_user_pages.rst: add a "case 5" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "vhost, docs: convert to pin_user_pages(), new "case 5"" It recently became clear to me that there are some get_user_pages*() callers that don't fit neatly into any of the four cases that are so far listed in pin_user_pages.rst. vhost.c is one of those. Add a Case 5 to the documentation, and refer to that when converting vhost.c. Thanks to Jan Kara for helping me (again) in understanding the interaction between get_user_pages() and page writeback [1]. This is based on today's mmotm, which has a nearby patch to pin_user_pages.rst that rewords cases 3 and 4. Note that I have only compile-tested the vhost.c patch, although that does also include cross-compiling for a few other arches. Any run-time testing would be greatly appreciated. [1] https://lore.kernel.org/r/20200529070343.GL14550@quack2.suse.cz This patch (of 2): There are four cases listed in pin_user_pages.rst. These are intended to help developers figure out whether to use get_user_pages*(), or pin_user_pages*(). However, the four cases do not cover all the situations. For example, drivers/vhost/vhost.c has a "pin, write to page, set page dirty, unpin" case. Add a fifth case, to help explain that there is a general pattern that requires pin_user_pages*() API calls. Link: http://lkml.kernel.org/r/20200529234309.484480-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200529234309.484480-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Vlastimil Babka Cc: Jan Kara Cc: Jérôme Glisse Cc: Dave Chinner Cc: Jonathan Corbet Cc: Souptick Joarder Cc: "Michael S . Tsirkin" Cc: Jason Wang Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- Documentation/core-api/pin_user_pages.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 4675b04e88290d..b9f2688a2c6744 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -171,6 +171,26 @@ If only struct page data (as opposed to the actual memory contents that a page is tracking) is affected, then normal GUP calls are sufficient, and neither flag needs to be set. +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +access page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. 
In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + access the data within the pages + set_page_dirty_lock() + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + access the data within the pages + set_page_dirty_lock() + put_page() + page_maybe_dma_pinned(): the whole point of pinning =================================================== From 75ec0147108b22ef0ea1f3f7a9017556addbefa5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 4 Jun 2020 11:46:06 +1000 Subject: [PATCH 100/241] vhost: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), in approximately a "Case 5" scenario (accessing the data within a page), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Link: http://lkml.kernel.org/r/20200529234309.484480-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Cc: Michael S. Tsirkin Cc: Jason Wang Cc: Dave Chinner Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Souptick Joarder Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- drivers/vhost/vhost.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 694f1c31c3ae85..48de322cb9f72d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1814,15 +1814,14 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); + r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); - set_page_dirty_lock(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); return 0; } From 97ec4b1a3a4abf406bc06c2b595bca599d290504 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:07 +1000 Subject: [PATCH 101/241] h8300: remove usage of __ARCH_USE_5LEVEL_HACK Patch series "mm: remove __ARCH_HAS_5LEVEL_HACK", v4. These patches convert several architectures to use page table folding and remove __ARCH_HAS_5LEVEL_HACK along with include/asm-generic/5level-fixup.h and include/asm-generic/pgtable-nop4d-hack.h. With that we'll have a single and consistent way of dealing with page table folding instead of a mix of three existing options. The changes are mostly about mechanical replacement of pgd accessors with p4d ones and the addition of higher levels to page table traversals. This patch (of 14): h8300 is a nommu architecture and does not require fixup for upper layers of the page tables because it is already handled by the generic nommu implementation. 
Remove definition of __ARCH_USE_5LEVEL_HACK in arch/h8300/include/asm/pgtable.h Link: http://lkml.kernel.org/r/20200414153455.21744-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200414153455.21744-2-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne [openrisc] Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/h8300/include/asm/pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index 4d00152fab58d3..f00828720dc455 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _H8300_PGTABLE_H #define _H8300_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #include extern void paging_init(void); From 81125740bc7ee42354d84b87efe1e23125ee82ca Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:08 +1000 Subject: [PATCH 102/241] arm: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate, and remove __ARCH_USE_5LEVEL_HACK. Link: http://lkml.kernel.org/r/20200414153455.21744-3-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/include/asm/pgtable.h | 1 - arch/arm/lib/uaccess_with_memcpy.c | 7 +++++- arch/arm/mach-sa1100/assabet.c | 2 +- arch/arm/mm/dump.c | 29 +++++++++++++++++----- arch/arm/mm/fault-armv.c | 7 +++++- arch/arm/mm/fault.c | 22 ++++++++++------ arch/arm/mm/idmap.c | 3 ++- arch/arm/mm/init.c | 2 +- arch/arm/mm/ioremap.c | 12 ++++++--- arch/arm/mm/mm.h | 2 +- arch/arm/mm/mmu.c | 35 +++++++++++++++++++++----- arch/arm/mm/pgd.c | 40 ++++++++++++++++++++++++------ 12 files changed, 125 insertions(+), 37 deletions(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index befc8fcec98f75..fba20607c53cc0 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -17,7 +17,6 @@ #else -#define __ARCH_USE_5LEVEL_HACK #include #include #include diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c9450982a1558e..d72b14c966702d 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -24,6 +24,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) { unsigned long addr = (unsigned long)_addr; pgd_t *pgd; + p4d_t *p4d; pmd_t *pmd; pte_t *pte; pud_t *pud; @@ -33,7 +34,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); 
+ if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d))) + return 0; + + pud = pud_offset(p4d, addr); if (unlikely(pud_none(*pud) || pud_bad(*pud))) return 0; diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index d96a101e550487..0631a7b026782f 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -633,7 +633,7 @@ static void __init map_sa1100_gpio_regs( void ) int prot = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_DOMAIN(DOMAIN_IO); pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt); *pmd = __pmd(phys | prot); flush_pmd_entry(pmd); } diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 7d6291f23251e7..677549d6854c6a 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -207,6 +207,7 @@ struct pg_level { static struct pg_level pg_level[] = { { }, { /* pgd */ + }, { /* p4d */ }, { /* pud */ }, { /* pmd */ .bits = section_bits, @@ -308,7 +309,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start, for (i = 0; i < PTRS_PER_PTE; i++, pte++) { addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), domain); + note_page(st, addr, 5, pte_val(*pte), domain); } } @@ -350,14 +351,14 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) addr += SECTION_SIZE; pmd++; domain = get_domain_name(pmd); - note_page(st, addr, 3, pmd_val(*pmd), domain); + note_page(st, addr, 4, pmd_val(*pmd), domain); } } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned i; @@ -366,7 +367,23 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) if (!pud_none(*pud)) { walk_pmd(st, pud, addr); } else { - note_page(st, addr, 2, pud_val(*pud), NULL); + note_page(st, addr, 3, pud_val(*pud), NULL); + } + } +} + +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) { + walk_pud(st, p4d, addr); + } else { + note_page(st, addr, 2, p4d_val(*p4d), NULL); } } } @@ -381,7 +398,7 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { addr = start + i * PGDIR_SIZE; if (!pgd_none(*pgd)) { - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } else { note_page(st, addr, 1, pgd_val(*pgd), NULL); } diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index ae857f41f68dc9..489aaafa6ebd60 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -91,6 +91,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, { spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -100,7 +101,11 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pgd_none_or_clear_bad(pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none_or_clear_bad(pud)) return 0; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2dd5c41cbb8d47..ff230e9affc49c 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -43,19 +43,21 @@ void show_pte(const char 
*lvl, struct mm_struct *mm, unsigned long addr) printk("%s[%08lx] *pgd=%08llx", lvl, addr, (long long)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) break; - if (pgd_bad(*pgd)) { + if (p4d_bad(*p4d)) { pr_cont("(bad)"); break; } - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%08llx", (long long)pud_val(*pud)); @@ -405,6 +407,7 @@ do_translation_fault(unsigned long addr, unsigned int fsr, { unsigned int index; pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -419,13 +422,16 @@ do_translation_fault(unsigned long addr, unsigned int fsr, pgd = cpu_get_pgd() + index; pgd_k = init_mm.pgd + index; - if (pgd_none(*pgd_k)) + p4d = p4d_offset(pgd, addr); + p4d_k = p4d_offset(pgd_k, addr); + + if (p4d_none(*p4d_k)) goto bad_area; - if (!pgd_present(*pgd)) - set_pgd(pgd, *pgd_k); + if (!p4d_present(*p4d)) + set_p4d(p4d, *p4d_k); - pud = pud_offset(pgd, addr); - pud_k = pud_offset(pgd_k, addr); + pud = pud_offset(p4d, addr); + pud_k = pud_offset(p4d_k, addr); if (pud_none(*pud_k)) goto bad_area; diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index a033f6134a6499..cd54411ef1b85d 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -68,7 +68,8 @@ static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, static void idmap_add_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long prot) { - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 4e43455fab8426..2de1e757d54fba 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -519,7 +519,7 @@ static inline void section_update(unsigned long addr, pmdval_t mask, { pmd_t *pmd; - pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr); + pmd = pmd_off_k(addr); #ifdef CONFIG_ARM_LPAE pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot); diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 72286f9a4d3047..75529d76d28c65 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -142,12 +142,14 @@ static void unmap_area_sections(unsigned long virt, unsigned long size) { unsigned long addr = virt, end = virt + (size & ~(SZ_1M - 1)); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmdp; flush_cache_vunmap(addr, end); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmdp = pmd_offset(pud, addr); do { pmd_t pmd = *pmdp; @@ -190,6 +192,7 @@ remap_area_sections(unsigned long virt, unsigned long pfn, { unsigned long addr = virt, end = virt + size; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -200,7 +203,8 @@ remap_area_sections(unsigned long virt, unsigned long pfn, unmap_area_sections(virt, size); pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_offset(p4d, addr); pmd = pmd_offset(pud, addr); do { pmd[0] = __pmd(__pfn_to_phys(pfn) | type->prot_sect); @@ -222,6 +226,7 @@ remap_area_supersections(unsigned long virt, unsigned long pfn, { unsigned long addr = virt, end = virt + size; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -232,7 +237,8 @@ remap_area_supersections(unsigned long virt, unsigned long pfn, unmap_area_sections(virt, size); pgd = pgd_offset_k(virt); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = 
pud_offset(p4d, addr); pmd = pmd_offset(pud, addr); do { unsigned long super_pmd_val, i; diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 88c121ac14b3d2..4f1f72b7589092 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -38,7 +38,7 @@ static inline pte_t get_top_pte(unsigned long va) static inline pmd_t *pmd_off_k(unsigned long virt) { - return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt); + return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(virt), virt), virt), virt); } struct mem_type { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index ec8d0008bfa1c8..c425288f1a8605 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -357,7 +357,8 @@ static pte_t *pte_offset_late_fixmap(pmd_t *dir, unsigned long addr) static inline pmd_t * __init fixmap_pmd(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; @@ -801,12 +802,12 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, +static void __init alloc_init_pud(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys, const struct mem_type *type, void *(*alloc)(unsigned long sz), bool ng) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { @@ -816,6 +817,21 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } +static void __init alloc_init_p4d(pgd_t *pgd, unsigned long addr, + unsigned long end, phys_addr_t phys, + const struct mem_type *type, + void *(*alloc)(unsigned long sz), bool ng) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + alloc_init_pud(p4d, addr, next, phys, type, alloc, ng); + phys += next - addr; + } while (p4d++, addr = next, addr != end); +} + #ifndef CONFIG_ARM_LPAE static void __init create_36bit_mapping(struct mm_struct *mm, struct map_desc *md, @@ -863,7 +879,8 @@ static void __init create_36bit_mapping(struct mm_struct *mm, pgd = pgd_offset(mm, addr); end = addr + length; do { - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); int i; @@ -914,7 +931,7 @@ static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md, do { unsigned long next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, type, alloc, ng); + alloc_init_p4d(pgd, addr, next, phys, type, alloc, ng); phys += next - addr; addr = next; @@ -950,7 +967,13 @@ void __init create_mapping_late(struct mm_struct *mm, struct map_desc *md, bool ng) { #ifdef CONFIG_ARM_LPAE - pud_t *pud = pud_alloc(mm, pgd_offset(mm, md->virtual), md->virtual); + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_alloc(mm, pgd_offset(mm, md->virtual), md->virtual); + if (!WARN_ON(!p4d)) + return; + pud = pud_alloc(mm, p4d, md->virtual); if (WARN_ON(!pud)) return; pmd_alloc(mm, pud, 0); diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 478bd2c6aa501a..c5e1b27046a882 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -30,6 +30,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd, *init_pgd; + p4d_t *new_p4d, *init_p4d; pud_t *new_pud, *init_pud; pmd_t *new_pmd, *init_pmd; pte_t *new_pte, *init_pte; @@ -53,8 +54,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) 
/* * Allocate PMD table for modules and pkmap mappings. */ - new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), + new_p4d = p4d_alloc(mm, new_pgd + pgd_index(MODULES_VADDR), MODULES_VADDR); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, MODULES_VADDR); if (!new_pud) goto no_pud; @@ -69,7 +74,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * contains the machine vectors. The vectors are always high * with LPAE. */ - new_pud = pud_alloc(mm, new_pgd, 0); + new_p4d = p4d_alloc(mm, new_pgd, 0); + if (!new_p4d) + goto no_p4d; + + new_pud = pud_alloc(mm, new_p4d, 0); if (!new_pud) goto no_pud; @@ -91,7 +100,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_val(*new_pmd) |= PMD_DOMAIN(DOMAIN_VECTORS); #endif - init_pud = pud_offset(init_pgd, 0); + init_p4d = p4d_offset(init_pgd, 0); + init_pud = pud_offset(init_p4d, 0); init_pmd = pmd_offset(init_pud, 0); init_pte = pte_offset_map(init_pmd, 0); set_pte_ext(new_pte + 0, init_pte[0], 0); @@ -108,6 +118,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) no_pmd: pud_free(mm, new_pud); no_pud: + p4d_free(mm, new_p4d); +no_p4d: __pgd_free(new_pgd); no_pgd: return NULL; @@ -116,6 +128,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgtable_t pte; @@ -127,7 +140,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) if (pgd_none_or_clear_bad(pgd)) goto no_pgd; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + goto no_p4d; + + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) goto no_pud; @@ -144,8 +161,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pmd_free(mm, pmd); mm_dec_nr_pmds(mm); no_pud: - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); +no_p4d: + pgd_clear(pgd); + p4d_free(mm, p4d); no_pgd: #ifdef CONFIG_ARM_LPAE /* @@ -156,15 +176,21 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) continue; if (pgd_val(*pgd) & L_PGD_SWAPPER) continue; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + if (p4d_none_or_clear_bad(p4d)) + continue; + pud = pud_offset(p4d, 0); if (pud_none_or_clear_bad(pud)) continue; pmd = pmd_offset(pud, 0); pud_clear(pud); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); - pgd_clear(pgd); + p4d_clear(p4d); pud_free(mm, pud); + mm_dec_nr_puds(mm); + pgd_clear(pgd); + p4d_free(mm, p4d); } #endif __pgd_free(pgd_base); From e87d76cf51e1f00a8e258589060b9015ec93393d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:08 +1000 Subject: [PATCH 103/241] arm-add-support-for-folded-p4d-page-tables-fix fix kexec Link: http://lkml.kernel.org/r/20200508174232.GA759899@linux.ibm.com Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2de1e757d54fba..01e18e43b1742e 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -519,7 +519,7 @@ static inline void section_update(unsigned long addr, pmdval_t mask, { pmd_t *pmd; - pmd = pmd_off_k(addr); + pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, addr), addr), addr), addr); #ifdef CONFIG_ARM_LPAE pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot); From 78f5aa8e0ac2820fcd29a025fd427d76a167a7c5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:09 +1000 Subject: [PATCH 104/241] arm64: add support for folded p4d page tables Implement primitives necessary for the 4th 
level folding, add walks of p4d level where appropriate, replace 5level-fixup.h with pgtable-nop4d.h and remove __ARCH_USE_5LEVEL_HACK. Link: http://lkml.kernel.org/r/20200414153455.21744-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/include/asm/kvm_mmu.h | 10 +- arch/arm64/include/asm/pgalloc.h | 10 +- arch/arm64/include/asm/pgtable-types.h | 5 +- arch/arm64/include/asm/pgtable.h | 37 +++-- arch/arm64/include/asm/stage2_pgtable.h | 48 ++++-- arch/arm64/kernel/hibernate.c | 44 ++++- arch/arm64/kvm/mmu.c | 209 ++++++++++++++++++++---- arch/arm64/mm/fault.c | 9 +- arch/arm64/mm/hugetlbpage.c | 15 +- arch/arm64/mm/kasan_init.c | 26 ++- arch/arm64/mm/mmu.c | 52 ++++-- arch/arm64/mm/pageattr.c | 7 +- 12 files changed, 368 insertions(+), 104 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 324c8483d2b900..f1a74163d764a6 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -172,8 +172,8 @@ void kvm_clear_hyp_idmap(void); __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) #define kvm_mk_pud(pmdp) \ __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_pgd(pudp) \ - __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_mk_p4d(pmdp) \ + __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) #define kvm_set_pud(pudp, pud) set_pud(pudp, pud) @@ -299,6 +299,12 @@ static inline bool kvm_s2pud_young(pud_t pud) #define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) #endif +#ifdef __PAGETABLE_P4D_FOLDED +#define hyp_p4d_table_empty(p4dp) (0) +#else +#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) +#endif + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 172d76fa02451d..58e93583ddb64f 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -73,17 +73,17 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pudp) free_page((unsigned long)pudp); } -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { - set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot)); + set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE); + __p4d_populate(p4dp, __pa(pudp), PUD_TYPE_TABLE); } #else -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) +static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { BUILD_BUG(); } diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index acb0751a6606bb..b8f158ae252736 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -14,6 +14,7 @@ typedef u64 pteval_t; typedef u64 
pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; /* @@ -44,13 +45,11 @@ typedef struct { pteval_t pgprot; } pgprot_t; #define __pgprot(x) ((pgprot_t) { (x) } ) #if CONFIG_PGTABLE_LEVELS == 2 -#define __ARCH_USE_5LEVEL_HACK #include #elif CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include #elif CONFIG_PGTABLE_LEVELS == 4 -#include +#include #endif #endif /* __ASM_PGTABLE_TYPES_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9ce000f22d9e5e..1f3218fc52fc2b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -298,6 +298,11 @@ static inline pte_t pgd_pte(pgd_t pgd) return __pte(pgd_val(pgd)); } +static inline pte_t p4d_pte(p4d_t p4d) +{ + return __pte(p4d_val(p4d)); +} + static inline pte_t pud_pte(pud_t pud) { return __pte(pud_val(pud)); @@ -401,6 +406,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) +#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) +#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) + #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) #define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) @@ -592,49 +600,50 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!(pgd_val(pgd) & 2)) -#define pgd_present(pgd) (pgd_val(pgd)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) +#define p4d_present(p4d) (p4d_val(p4d)) -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { - if (in_swapper_pgdir(pgdp)) { - set_swapper_pgd(pgdp, pgd); + if (in_swapper_pgdir(p4dp)) { + set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); return; } - WRITE_ONCE(*pgdp, pgd); + WRITE_ONCE(*p4dp, p4d); dsb(ishst); isb(); } -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - set_pgd(pgdp, __pgd(0)); + set_p4d(p4dp, __p4d(0)); } -static inline phys_addr_t pgd_page_paddr(pgd_t pgd) +static inline phys_addr_t p4d_page_paddr(p4d_t p4d) { - return __pgd_to_phys(pgd); + return __p4d_to_phys(p4d); } /* Find an entry in the frst-level page table. 
*/ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) -#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr)) #define pud_clear_fixmap() clear_fixmap(FIX_PUD) -#define pgd_page(pgd) phys_to_page(__pgd_to_phys(pgd)) +#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) /* use ONLY for statically allocated translation tables */ #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) #else +#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in */ diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 326aac658b9da8..9a364aeae5fbbe 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -68,41 +68,67 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm) #define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) + +static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + return p4d_offset(pgd, address); +} + +static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d) +{ +} + +static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) +{ + return false; +} + +static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, + phys_addr_t addr, phys_addr_t end) +{ + return end; +} + +static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_none(pgd); + return p4d_none(p4d); else return 0; } -static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) { if (kvm_stage2_has_pud(kvm)) - pgd_clear(pgdp); + p4d_clear(p4dp); } -static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) +static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) { if (kvm_stage2_has_pud(kvm)) - return pgd_present(pgd); + return p4d_present(p4d); else return 1; } -static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) { if (kvm_stage2_has_pud(kvm)) - pgd_populate(NULL, pgd, pud); + p4d_populate(NULL, p4d, pud); } static inline pud_t *stage2_pud_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) + p4d_t *p4d, unsigned long address) { if (kvm_stage2_has_pud(kvm)) - return pud_offset(pgd, address); + return pud_offset(p4d, address); else - return (pud_t *)pgd; + return (pud_t *)p4d; } static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 5b73e92c99e353..a8a4b55f3a09d4 100644 --- a/arch/arm64/kernel/hibernate.c +++ 
b/arch/arm64/kernel/hibernate.c @@ -184,6 +184,7 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgprot_t pgprot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -196,7 +197,15 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgd_populate(&init_mm, pgdp, pudp); } - pudp = pud_offset(pgdp, dst_addr); + p4dp = p4d_offset(pgdp, dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = (void *)get_safe_page(GFP_ATOMIC); + if (!pudp) + return -ENOMEM; + p4d_populate(&init_mm, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); if (pud_none(READ_ONCE(*pudp))) { pmdp = (void *)get_safe_page(GFP_ATOMIC); if (!pmdp) @@ -419,7 +428,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, return 0; } -static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, +static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, unsigned long end) { pud_t *dst_pudp; @@ -427,15 +436,15 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, unsigned long next; unsigned long addr = start; - if (pgd_none(READ_ONCE(*dst_pgdp))) { + if (p4d_none(READ_ONCE(*dst_p4dp))) { dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); if (!dst_pudp) return -ENOMEM; - pgd_populate(&init_mm, dst_pgdp, dst_pudp); + p4d_populate(&init_mm, dst_p4dp, dst_pudp); } - dst_pudp = pud_offset(dst_pgdp, start); + dst_pudp = pud_offset(dst_p4dp, start); - src_pudp = pud_offset(src_pgdp, start); + src_pudp = pud_offset(src_p4dp, start); do { pud_t pud = READ_ONCE(*src_pudp); @@ -454,6 +463,27 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, return 0; } +static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, unsigned long end) { @@ -466,7 +496,7 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, next = pgd_addr_end(addr, end); if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_pud(dst_pgdp, src_pgdp, addr, next)) + if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a1f6bc70c4e45e..8848f062919b97 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -158,13 +158,22 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(kvm, pud_table); + stage2_p4d_free(kvm, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr) +{ + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); + stage2_p4d_clear(kvm, p4d); + kvm_tlb_flush_vmid_ipa(kvm, addr); + stage2_pud_free(kvm, pud_table); + put_page(virt_to_page(p4d)); +} + static void 
clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); @@ -208,12 +217,20 @@ static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) dsb(ishst); } -static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp) +static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) { - WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp)); + WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); dsb(ishst); } +static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) +{ +#ifndef __PAGETABLE_P4D_FOLDED + WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); + dsb(ishst); +#endif +} + /* * Unmapping vs dcache management: * @@ -293,13 +310,13 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, clear_stage2_pud_entry(kvm, pud, start_addr); } -static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, +static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(kvm, pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -317,6 +334,23 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); if (stage2_pud_table_empty(kvm, start_pud)) + clear_stage2_p4d_entry(kvm, p4d, start_addr); +} + +static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next, start_addr = addr; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + unmap_stage2_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (stage2_p4d_table_empty(kvm, start_p4d)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -351,7 +385,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) break; next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_puds(kvm, pgd, addr, next); + unmap_stage2_p4ds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock * to prevent starvation and lockup detector warnings. 
@@ -391,13 +425,13 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, } while (pmd++, addr = next, addr != end); } -static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, +static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -409,6 +443,20 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); } +static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_flush_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -421,7 +469,7 @@ static void stage2_flush_memslot(struct kvm *kvm, do { next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_puds(kvm, pgd, addr, next); + stage2_flush_p4ds(kvm, pgd, addr, next); if (next != end) cond_resched_lock(&kvm->mmu_lock); @@ -454,12 +502,21 @@ static void stage2_flush_vm(struct kvm *kvm) static void clear_hyp_pgd_entry(pgd_t *pgd) { - pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL); + p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); pgd_clear(pgd); - pud_free(NULL, pud_table); + p4d_free(NULL, p4d_table); put_page(virt_to_page(pgd)); } +static void clear_hyp_p4d_entry(p4d_t *p4d) +{ + pud_t *pud_table __maybe_unused = pud_offset(p4d, 0); + VM_BUG_ON(p4d_huge(*p4d)); + p4d_clear(p4d); + pud_free(NULL, pud_table); + put_page(virt_to_page(p4d)); +} + static void clear_hyp_pud_entry(pud_t *pud) { pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); @@ -511,12 +568,12 @@ static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) clear_hyp_pud_entry(pud); } -static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { phys_addr_t next; pud_t *pud, *start_pud; - start_pud = pud = pud_offset(pgd, addr); + start_pud = pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); /* Hyp doesn't use huge puds */ @@ -525,6 +582,23 @@ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) } while (pud++, addr = next, addr != end); if (hyp_pud_table_empty(start_pud)) + clear_hyp_p4d_entry(p4d); +} + +static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t next; + p4d_t *p4d, *start_p4d; + + start_p4d = p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + /* Hyp doesn't use huge p4ds */ + if (!p4d_none(*p4d)) + unmap_hyp_puds(p4d, addr, next); + } while (p4d++, addr = next, addr != end); + + if (hyp_p4d_table_empty(start_p4d)) clear_hyp_pgd_entry(pgd); } @@ -548,7 +622,7 @@ static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, do { next = pgd_addr_end(addr, end); if (!pgd_none(*pgd)) - unmap_hyp_puds(pgd, addr, next); + unmap_hyp_p4ds(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -658,7 +732,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, return 0; } -static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, +static int 
create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -669,7 +743,7 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, addr = start; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) { pmd = pmd_alloc_one(NULL, addr); @@ -691,12 +765,45 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, return 0; } +static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, + unsigned long end, unsigned long pfn, + pgprot_t prot) +{ + p4d_t *p4d; + pud_t *pud; + unsigned long addr, next; + int ret; + + addr = start; + do { + p4d = p4d_offset(pgd, addr); + + if (p4d_none(*p4d)) { + pud = pud_alloc_one(NULL, addr); + if (!pud) { + kvm_err("Cannot allocate Hyp pud\n"); + return -ENOMEM; + } + kvm_p4d_populate(p4d, pud); + get_page(virt_to_page(p4d)); + } + + next = p4d_addr_end(addr, end); + ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); + if (ret) + return ret; + pfn += (next - addr) >> PAGE_SHIFT; + } while (addr = next, addr != end); + + return 0; +} + static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; unsigned long addr, next; int err = 0; @@ -707,18 +814,18 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); if (pgd_none(*pgd)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); + p4d = p4d_alloc_one(NULL, addr); + if (!p4d) { + kvm_err("Cannot allocate Hyp p4d\n"); err = -ENOMEM; goto out; } - kvm_pgd_populate(pgd, pud); + kvm_pgd_populate(pgd, p4d); get_page(virt_to_page(pgd)); } next = pgd_addr_end(addr, end); - err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot); + err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); if (err) goto out; pfn += (next - addr) >> PAGE_SHIFT; @@ -1015,22 +1122,40 @@ void kvm_free_stage2_pgd(struct kvm *kvm) free_pages_exact(pgd, stage2_pgd_size(kvm)); } -static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, +static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr) { pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; - pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, pud); + p4d = mmu_memory_cache_alloc(cache); + stage2_pgd_populate(kvm, pgd, p4d); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(kvm, pgd, addr); + return stage2_p4d_offset(kvm, pgd, addr); +} + +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) +{ + p4d_t *p4d; + pud_t *pud; + + p4d = stage2_get_p4d(kvm, cache, addr); + if (stage2_p4d_none(kvm, *p4d)) { + if (!cache) + return NULL; + pud = mmu_memory_cache_alloc(cache); + stage2_p4d_populate(kvm, p4d, pud); + get_page(virt_to_page(p4d)); + } + + return stage2_pud_offset(kvm, p4d, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1423,18 +1548,18 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, } /** - * stage2_wp_puds - write protect PGD range + * stage2_wp_puds - write protect P4D range * @pgd: pointer to pgd entry * @addr: range start address * @end: range end address */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, 
+static void stage2_wp_puds(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(kvm, pgd, addr); + pud = stage2_pud_offset(kvm, p4d, addr); do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { @@ -1448,6 +1573,26 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, } while (pud++, addr = next, addr != end); } +/** + * stage2_wp_p4ds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + p4d_t *p4d; + phys_addr_t next; + + p4d = stage2_p4d_offset(kvm, pgd, addr); + do { + next = stage2_p4d_addr_end(kvm, addr, end); + if (!stage2_p4d_none(kvm, *p4d)) + stage2_wp_puds(kvm, p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + /** * stage2_wp_range() - write protect stage2 memory region range * @kvm: The KVM pointer @@ -1475,7 +1620,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) break; next = stage2_pgd_addr_end(kvm, addr, end); if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); + stage2_wp_p4ds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index dff2d72b0883bd..df8ae73d950b61 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -145,6 +145,7 @@ static void show_pte(unsigned long addr) pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd)); do { + p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -152,7 +153,13 @@ static void show_pte(unsigned long addr) if (pgd_none(pgd) || pgd_bad(pgd)) break; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); + pr_cont(", p4d=%016llx", p4d_val(p4d)); + if (p4d_none(p4d) || p4d_bad(p4d)) + break; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); pr_cont(", pud=%016llx", pud_val(pud)); if (pud_none(pud) || pud_bad(pud)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 07f154b8b84ad6..0a52ce46f0200e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -67,11 +67,13 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { pgd_t *pgdp = pgd_offset(mm, addr); + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; *pgsize = PAGE_SIZE; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_offset(p4dp, addr); pmdp = pmd_offset(pudp, addr); if ((pte_t *)pmdp == ptep) { *pgsize = PMD_SIZE; @@ -217,12 +219,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; @@ -261,6 +265,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -268,7 +273,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (!pgd_present(READ_ONCE(*pgdp))) return NULL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (!p4d_present(READ_ONCE(*p4dp))) + return NULL; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (sz != PUD_SIZE && pud_none(pud)) return NULL; diff --git 
a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index f87a32484ea82c..2339811f317b14 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -84,17 +84,17 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } -static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node, +static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { - if (pgd_none(READ_ONCE(*pgdp))) { + if (p4d_none(READ_ONCE(*p4dp))) { phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); - __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE); + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); } - return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr); + return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); } static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, @@ -126,11 +126,11 @@ static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, } while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp))); } -static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, +static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early); + pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early); do { next = pud_addr_end(addr, end); @@ -138,6 +138,18 @@ static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, } while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp))); } +static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr, + unsigned long end, int node, bool early) +{ + unsigned long next; + p4d_t *p4dp = p4d_offset(pgdp, addr); + + do { + next = p4d_addr_end(addr, end); + kasan_pud_populate(p4dp, addr, next, node, early); + } while (p4dp++, addr = next, addr != end); +} + static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, int node, bool early) { @@ -147,7 +159,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, pgdp = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_pud_populate(pgdp, addr, next, node, early); + kasan_p4d_populate(pgdp, addr, next, node, early); } while (pgdp++, addr = next, addr != end); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c299b73dd5e4e2..e7fbc627532992 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -290,18 +290,19 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, { unsigned long next; pud_t *pudp; - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - if (pgd_none(pgd)) { + if (p4d_none(p4d)) { phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); - __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); - pgd = READ_ONCE(*pgdp); + __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE); + p4d = READ_ONCE(*p4dp); } - BUG_ON(pgd_bad(pgd)); + BUG_ON(p4d_bad(p4d)); - pudp = pud_set_fixmap_offset(pgdp, addr); + pudp = pud_set_fixmap_offset(p4dp, addr); do { pud_t old_pud = READ_ONCE(*pudp); @@ -672,6 +673,7 @@ static void __init map_kernel(pgd_t *pgdp) READ_ONCE(*pgd_offset_k(FIXADDR_START))); } else if (CONFIG_PGTABLE_LEVELS > 3) { pgd_t *bm_pgdp; + p4d_t *bm_p4dp; pud_t *bm_pudp; /* * The fixmap shares its top level pgd entry with the kernel 
@@ -681,7 +683,8 @@ static void __init map_kernel(pgd_t *pgdp) */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START); - bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START); + bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START); + bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START); pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd)); pud_clear_fixmap(); } else { @@ -715,6 +718,7 @@ void __init paging_init(void) int kern_addr_valid(unsigned long addr) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; @@ -726,7 +730,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(READ_ONCE(*pgdp))) return 0; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return 0; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return 0; @@ -1069,6 +1077,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, unsigned long addr = start; unsigned long next; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; @@ -1079,7 +1088,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, if (!pgdp) return -ENOMEM; - pudp = vmemmap_pud_populate(pgdp, addr, node); + p4dp = vmemmap_p4d_populate(pgdp, addr, node); + if (!p4dp) + return -ENOMEM; + + pudp = vmemmap_pud_populate(p4dp, addr, node); if (!pudp) return -ENOMEM; @@ -1114,11 +1127,12 @@ void vmemmap_free(unsigned long start, unsigned long end, static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgdp = pgd_offset_k(addr); - pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t p4d = READ_ONCE(*p4dp); - BUG_ON(pgd_none(pgd) || pgd_bad(pgd)); + BUG_ON(p4d_none(p4d) || p4d_bad(p4d)); - return pud_offset_kimg(pgdp, addr); + return pud_offset_kimg(p4dp, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -1144,25 +1158,27 @@ static inline pte_t * fixmap_pte(unsigned long addr) */ void __init early_fixmap_init(void) { - pgd_t *pgdp, pgd; + pgd_t *pgdp; + p4d_t *p4dp, p4d; pud_t *pudp; pmd_t *pmdp; unsigned long addr = FIXADDR_START; pgdp = pgd_offset_k(addr); - pgd = READ_ONCE(*pgdp); + p4dp = p4d_offset(pgdp, addr); + p4d = READ_ONCE(*p4dp); if (CONFIG_PGTABLE_LEVELS > 3 && - !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) { + !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. 
*/ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - pudp = pud_offset_kimg(pgdp, addr); + pudp = pud_offset_kimg(p4dp, addr); } else { - if (pgd_none(pgd)) - __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); + if (p4d_none(p4d)) + __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); pudp = fixmap_pud(addr); } if (pud_none(READ_ONCE(*pudp))) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index bde08090b8389d..4175bcb8ccb395 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -198,6 +198,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) bool kernel_page_present(struct page *page) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep; @@ -210,7 +211,11 @@ bool kernel_page_present(struct page *page) if (pgd_none(READ_ONCE(*pgdp))) return false; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_none(READ_ONCE(*p4dp))) + return false; + + pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; From 47f5a844c89562a7c4c65c6142a128dedf84735c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jun 2020 11:46:10 +1000 Subject: [PATCH 105/241] arm64: kvm: fix gcc-10 shift warning gcc-10 warns that the 32-bit zero cannot be shifted more than 32 bits to the right: arch/arm64/kvm/../../../virt/kvm/arm/mmu.c: In function 'clear_hyp_p4d_entry': arch/arm64/include/asm/pgtable.h:630:35: error: right shift count >= width of type [-Werror=shift-count-overflow] 630 | #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) | ^~ arch/arm64/include/asm/memory.h:271:45: note: in definition of macro '__phys_to_virt' 271 | #define __phys_to_virt(x) ((unsigned long)((x) - physvirt_offset)) | ^ arch/arm64/include/asm/pgtable.h:633:42: note: in expansion of macro '__va' 633 | #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) | ^~~~ arch/arm64/include/asm/pgtable.h:632:73: note: in expansion of macro 'pud_index' 632 | #define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) | ^~~~~~~~~ arch/arm64/include/asm/pgtable.h:633:47: note: in expansion of macro 'pud_offset_phys' 633 | #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) | ^~~~~~~~~~~~~~~ arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:510:36: note: in expansion of macro 'pud_offset' 510 | pud_t *pud_table __maybe_unused = pud_offset(p4d, 0); | ^~~~~~~~~~ This is harmless, and the warning is a little bit silly for a zero constant, but it's trivial to fix by making it an unsigned long, so do that. 
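For illustration, the mechanics of the diagnostic can be reproduced outside
the kernel. This is only a sketch: the PUD_SHIFT value of 42 is assumed here
(it matches an arm64 64K-page layout) and both function names are made up.

	/* A bare 0 has type int, so shifting it right by PUD_SHIFT >= 32
	 * trips -Wshift-count-overflow in gcc-10, even though the result
	 * could only ever be zero. Assumes an LP64 target. */
	#define PUD_SHIFT 42	/* assumed value, for illustration only */

	unsigned long index_of_int_zero(void)
	{
		return 0 >> PUD_SHIFT;		/* warns: count >= width of int */
	}

	unsigned long index_of_ulong_zero(void)
	{
		return 0UL >> PUD_SHIFT;	/* 64-bit zero: well-defined */
	}

The 0UL spelling changes nothing at runtime; it only gives the constant a
64-bit type so the shift count stays below the width of the type.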
Link: http://lkml.kernel.org/r/20200429185657.4085975-1-arnd@arndb.de Fixes: 22998131ab33 ("arm64: add support for folded p4d page tables") Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Acked-by: Marc Zyngier Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8848f062919b97..290154e32c0bce 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -510,7 +510,7 @@ static void clear_hyp_pgd_entry(pgd_t *pgd) static void clear_hyp_p4d_entry(p4d_t *p4d) { - pud_t *pud_table __maybe_unused = pud_offset(p4d, 0); + pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); VM_BUG_ON(p4d_huge(*p4d)); p4d_clear(p4d); pud_free(NULL, pud_table); From 1468951aa6ca4834b68a986e53e9c3b1c95feaac Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:11 +1000 Subject: [PATCH 106/241] hexagon: remove __ARCH_USE_5LEVEL_HACK The hexagon architecture has 2 level page tables and as such most of the page table folding is already implemented in asm-generic/pgtable-nopmd.h. Fixup the only place in arch/hexagon to unfold the p4d level and remove __ARCH_USE_5LEVEL_HACK. Link: http://lkml.kernel.org/r/20200414153455.21744-5-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/hexagon/include/asm/fixmap.h | 4 ++-- arch/hexagon/include/asm/pgtable.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/hexagon/include/asm/fixmap.h b/arch/hexagon/include/asm/fixmap.h index 933dac1675042e..97b1b062e7503a 100644 --- a/arch/hexagon/include/asm/fixmap.h +++ b/arch/hexagon/include/asm/fixmap.h @@ -16,7 +16,7 @@ #include #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), \ - (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), \ + (vaddr)), (vaddr)), (vaddr)), (vaddr)) #endif diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index d383e8bea5b240..2a17d4eb2fa42f 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -12,7 +12,6 @@ * Page table definitions for Qualcomm Hexagon processor. */ #include -#define __ARCH_USE_5LEVEL_HACK #include /* A handy thing to have if one has the RAM. 
Declared in head.S */ From 7767fd03e792714a10834a9be6da40f48cdf4bdd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:12 +1000 Subject: [PATCH 107/241] ia64: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate, remove usage of __ARCH_USE_5LEVEL_HACK and replace 5level-fixup.h with pgtable-nop4d.h Link: http://lkml.kernel.org/r/20200414153455.21744-6-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/ia64/include/asm/pgalloc.h | 4 ++-- arch/ia64/include/asm/pgtable.h | 17 ++++++++--------- arch/ia64/mm/fault.c | 7 ++++++- arch/ia64/mm/hugetlbpage.c | 18 ++++++++++++------ arch/ia64/mm/init.c | 28 ++++++++++++++++++++++++---- 5 files changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index f4c49104488288..2a30503450995d 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -36,9 +36,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #if CONFIG_PGTABLE_LEVELS == 4 static inline void -pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) +p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud) { - pgd_val(*pgd_entry) = __pa(pud); + p4d_val(*p4d_entry) = __pa(pud); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 0e7b645b76c6f9..787b0a91d2553b 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -283,12 +283,12 @@ extern unsigned long VMALLOC_END; #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) #if CONFIG_PGTABLE_LEVELS == 4 -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) -#define pgd_present(pgd) (pgd_val(pgd) != 0UL) -#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) -#define pgd_page(pgd) virt_to_page((pgd_val(pgd) + PAGE_OFFSET)) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (!ia64_phys_addr_valid(p4d_val(p4d))) +#define p4d_present(p4d) (p4d_val(p4d) != 0UL) +#define p4d_clear(p4dp) (p4d_val(*(p4dp)) = 0UL) +#define p4d_page_vaddr(p4d) ((unsigned long) __va(p4d_val(p4d) & _PFN_MASK)) +#define p4d_page(p4d) virt_to_page((p4d_val(p4d) + PAGE_OFFSET)) #endif /* @@ -386,7 +386,7 @@ pgd_offset (const struct mm_struct *mm, unsigned long address) #if CONFIG_PGTABLE_LEVELS == 4 /* Find an entry in the second-level page table.. */ #define pud_offset(dir,addr) \ - ((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + ((pud_t *) p4d_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #endif /* Find an entry in the third-level page table.. 
*/ @@ -580,10 +580,9 @@ extern struct page *zero_page_memmap_ptr; #if CONFIG_PGTABLE_LEVELS == 3 -#define __ARCH_USE_5LEVEL_HACK #include #endif -#include +#include #include #endif /* _ASM_IA64_PGTABLE_H */ diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 30d0c1fca99ed7..12242aa0dad1ab 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -29,6 +29,7 @@ static int mapped_kernel_page_is_present (unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; @@ -37,7 +38,11 @@ mapped_kernel_page_is_present (unsigned long address) if (pgd_none(*pgd) || pgd_bad(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_bad(*p4d)) + return 0; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_bad(*pud)) return 0; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d16e419fd7129f..32352a73df0c1c 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -30,12 +30,14 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); - pud = pud_alloc(mm, pgd, taddr); + p4d = p4d_offset(pgd, taddr); + pud = pud_alloc(mm, p4d, taddr); if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) @@ -49,17 +51,21 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); if (pgd_present(*pgd)) { - pud = pud_offset(pgd, taddr); - if (pud_present(*pud)) { - pmd = pmd_offset(pud, taddr); - if (pmd_present(*pmd)) - pte = pte_offset_map(pmd, taddr); + p4d = p4d_offset(pgd, addr); + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, taddr); + if (pud_present(*pud)) { + pmd = pmd_offset(pud, taddr); + if (pmd_present(*pmd)) + pte = pte_offset_map(pmd, taddr); + } } } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d637b4ea314773..ca760f6cb18f58 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -208,6 +208,7 @@ static struct page * __init put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -215,7 +216,10 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! 
*/ { - pud = pud_alloc(&init_mm, pgd, address); + p4d = p4d_alloc(&init_mm, pgd, address); + if (!p4d) + goto out; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) goto out; pmd = pmd_alloc(&init_mm, pud, address); @@ -382,6 +386,7 @@ int vmemmap_find_next_valid_pfn(int node, int i) do { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +397,13 @@ int vmemmap_find_next_valid_pfn(int node, int i) continue; } - pud = pud_offset(pgd, end_address); + p4d = p4d_offset(pgd, end_address); + if (p4d_none(*p4d)) { + end_address += P4D_SIZE; + continue; + } + + pud = pud_offset(p4d, end_address); if (pud_none(*pud)) { end_address += PUD_SIZE; continue; @@ -430,6 +441,7 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) struct page *map_start, *map_end; int node; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -444,12 +456,20 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); if (pgd_none(*pgd)) { + p4d = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); + if (!p4d) + goto err_alloc; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, address); + + if (p4d_none(*p4d)) { pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); if (!pud) goto err_alloc; - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); } - pud = pud_offset(pgd, address); + pud = pud_offset(p4d, address); if (pud_none(*pud)) { pmd = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); From f4ff3fd3cd65bcda476f052e3c2881c7bf2abc22 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:12 +1000 Subject: [PATCH 108/241] nios2: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. 
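The p4d walk added here is free on a two-level architecture such as nios2
because the generic headers fold the level away. Roughly, and simplified for
illustration only (see include/asm-generic/pgtable-nop4d.h for the real
definitions):

	/* When folded, a p4d entry is just another view of the pgd entry,
	 * so p4d_offset() is a pointer cast and the presence checks are
	 * compile-time constants that the optimizer removes. */
	typedef struct { pgd_t pgd; } p4d_t;

	#define PTRS_PER_P4D 1

	static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
	{
		return (p4d_t *)pgd;	/* the p4d entry is the pgd entry */
	}

	static inline int p4d_none(p4d_t p4d)    { return 0; }
	static inline int p4d_present(p4d_t p4d) { return 1; }

As a result the converted walkers below generate the same object code as
before the conversion.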
Link: http://lkml.kernel.org/r/20200414153455.21744-7-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/nios2/include/asm/pgtable.h | 3 +-- arch/nios2/mm/fault.c | 9 +++++++-- arch/nios2/mm/ioremap.c | 6 +++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index f98b7f4519ba36..47a1a3ea5734a8 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -22,7 +22,6 @@ #include #include -#define __ARCH_USE_5LEVEL_HACK #include #define FIRST_USER_ADDRESS 0UL @@ -100,7 +99,7 @@ extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; */ static inline void set_pmd(pmd_t *pmdptr, pmd_t pmdval) { - pmdptr->pud.pgd.pgd = pmdval.pud.pgd.pgd; + *pmdptr = pmdval; } /* to find an entry in a page-table-directory */ diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index ec9d8a9c426fcd..964eac1a21d0e1 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -242,6 +242,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -253,8 +254,12 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, goto no_context; set_pgd(pgd, *pgd_k); - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); diff --git a/arch/nios2/mm/ioremap.c b/arch/nios2/mm/ioremap.c index 819bdfcc2e714d..fe821efb9a9986 100644 --- a/arch/nios2/mm/ioremap.c +++ b/arch/nios2/mm/ioremap.c @@ -86,11 +86,15 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, if (address >= end) BUG(); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; error = -ENOMEM; - pud = pud_alloc(&init_mm, dir, address); + p4d = p4d_alloc(&init_mm, dir, address); + if (!p4d) + break; + pud = pud_alloc(&init_mm, p4d, address); if (!pud) break; pmd = pmd_alloc(&init_mm, pud, address); From 1cc815fde7d5ab3b4d533fce8b12dc9e636e94ca Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:13 +1000 Subject: [PATCH 109/241] openrisc: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. 
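As with nios2 above, the conversion follows one canonical descent shape.
A sketch for orientation (walk_example() is a hypothetical helper; the
per-level presence checks that real callers such as do_page_fault() perform
are elided):

	/* The standardized five-level descent. On openrisc the p4d and pud
	 * steps are folded, so they compile down to pointer casts. */
	static pte_t *walk_example(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd = pgd_offset(mm, address);
		p4d_t *p4d = p4d_offset(pgd, address);	/* folded: cast only */
		pud_t *pud = pud_offset(p4d, address);	/* folded: cast only */
		pmd_t *pmd = pmd_offset(pud, address);

		return pte_offset_kernel(pmd, address);
	}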
Link: http://lkml.kernel.org/r/20200414153455.21744-8-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/openrisc/include/asm/pgtable.h | 1 - arch/openrisc/mm/fault.c | 10 ++++++++-- arch/openrisc/mm/init.c | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 7f3fb9ceb0839f..219979e577908a 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -21,7 +21,6 @@ #ifndef __ASM_OPENRISC_PGTABLE_H #define __ASM_OPENRISC_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #ifndef __ASSEMBLY__ diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 8af1cc78c4fb7f..6e0a11ac4c00d4 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -295,6 +295,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, int offset = pgd_index(address); pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; @@ -321,8 +322,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, * it exists. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto no_context; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto no_context; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index f94fe6d3f499fc..3bcdc1c26b2301 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -68,6 +68,7 @@ static void __init map_ram(void) unsigned long v, p, e; pgprot_t prot; pgd_t *pge; + p4d_t *p4e; pud_t *pue; pmd_t *pme; pte_t *pte; @@ -87,7 +88,8 @@ static void __init map_ram(void) while (p < e) { int j; - pue = pud_offset(pge, v); + p4e = p4d_offset(pge, v); + pue = pud_offset(p4e, v); pme = pmd_offset(pue, v); if ((u32) pue != (u32) pge || (u32) pme != (u32) pge) { From c8e3823b14c243116af2854efeaa817047e53e45 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:13 +1000 Subject: [PATCH 110/241] powerpc: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h. 
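On powerpc the conversion reaches allocation paths as well as plain walks.
A sketch of the resulting pattern (map_example() is a hypothetical helper
with error handling trimmed to the essentials):

	/* pud_alloc() now hangs a new pud table off a p4d entry, via
	 * p4d_populate(), instead of installing it in the pgd directly. */
	static int map_example(unsigned long ea)
	{
		pgd_t *pgdp = pgd_offset_k(ea);
		p4d_t *p4dp = p4d_offset(pgdp, ea);
		pud_t *pudp = pud_alloc(&init_mm, p4dp, ea);

		if (!pudp)
			return -ENOMEM;
		/* ... pmd_alloc()/pte_alloc() continue unchanged ... */
		return 0;
	}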
Link: http://lkml.kernel.org/r/20200414153455.21744-9-rppt@kernel.org Signed-off-by: Mike Rapoport Tested-by: Christophe Leroy # 8xx and 83xx Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/include/asm/book3s/32/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/hash.h | 4 +- arch/powerpc/include/asm/book3s/64/pgalloc.h | 4 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 60 ++++++++++--------- arch/powerpc/include/asm/book3s/64/radix.h | 6 +- arch/powerpc/include/asm/nohash/32/pgtable.h | 1 - arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 +- .../include/asm/nohash/64/pgtable-4k.h | 32 +++++----- arch/powerpc/include/asm/nohash/64/pgtable.h | 6 +- arch/powerpc/include/asm/pgtable.h | 10 ++-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 32 ++++++---- arch/powerpc/lib/code-patching.c | 7 ++- arch/powerpc/mm/book3s64/hash_pgtable.c | 4 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 26 +++++--- arch/powerpc/mm/book3s64/subpage_prot.c | 6 +- arch/powerpc/mm/hugetlbpage.c | 28 +++++---- arch/powerpc/mm/nohash/book3e_pgtable.c | 15 ++--- arch/powerpc/mm/pgtable.c | 30 ++++++---- arch/powerpc/mm/pgtable_64.c | 10 ++-- arch/powerpc/mm/ptdump/hashpagetable.c | 20 ++++++- arch/powerpc/mm/ptdump/ptdump.c | 12 ++-- arch/powerpc/xmon/xmon.c | 18 +++--- 22 files changed, 196 insertions(+), 138 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 8a091d125f2d6d..d7978a5a79c39e 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_BOOK3S_32_PGTABLE_H #define _ASM_POWERPC_BOOK3S_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #include diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 6fc4520092c7b5..73ad038ed10b9d 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -134,9 +134,9 @@ static inline int get_region_id(unsigned long ea) #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) -static inline int hash__pgd_bad(pgd_t pgd) +static inline int hash__p4d_bad(p4d_t p4d) { - return (pgd_val(pgd) == 0); + return (p4d_val(p4d) == 0); } #ifdef CONFIG_STRICT_KERNEL_RWX extern void hash__mark_rodata_ro(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index a41e91bd058050..69c5b051734f04 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -85,9 +85,9 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud) { - *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git 
a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b78d00d617830b..f17442c3a09284 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ -#include +#include #ifndef __ASSEMBLY__ #include @@ -251,7 +251,7 @@ extern unsigned long __pmd_frag_size_shift; /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0xc0000000000000ffUL /* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL +#define P4D_MASKED_BITS 0xc0000000000000ffUL /* * Used as an indicator for rcu callback functions @@ -954,54 +954,60 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return pte_access_permitted(pud_pte(pud), write); } -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define __p4d_raw(x) ((p4d_t) { __pgd_raw(x) }) +static inline __be64 p4d_raw(p4d_t x) +{ + return pgd_raw(x.pgd); +} + +#define p4d_write(p4d) pte_write(p4d_pte(p4d)) -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline int pgd_none(pgd_t pgd) +static inline int p4d_none(p4d_t p4d) { - return !pgd_raw(pgd); + return !p4d_raw(p4d); } -static inline int pgd_present(pgd_t pgd) +static inline int p4d_present(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PRESENT)); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - return __pte_raw(pgd_raw(pgd)); + return __pte_raw(p4d_raw(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd_raw(pte_raw(pte)); + return __p4d_raw(pte_raw(pte)); } -static inline int pgd_bad(pgd_t pgd) +static inline int p4d_bad(p4d_t p4d) { if (radix_enabled()) - return radix__pgd_bad(pgd); - return hash__pgd_bad(pgd); + return radix__p4d_bad(p4d); + return hash__p4d_bad(p4d); } -#define pgd_access_permitted pgd_access_permitted -static inline bool pgd_access_permitted(pgd_t pgd, bool write) +#define p4d_access_permitted p4d_access_permitted +static inline bool p4d_access_permitted(p4d_t p4d, bool write) { - return pte_access_permitted(pgd_pte(pgd), write); + return pte_access_permitted(p4d_pte(p4d), write); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); /* Pointers in the page table tree are physical addresses */ #define __pgtable_ptr_val(ptr) __pa(ptr) #define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) #define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) -#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_page_vaddr(p4d) __va(p4d_val(p4d) & ~P4D_MASKED_BITS) static inline unsigned long pgd_index(unsigned long address) { @@ -1030,8 +1036,8 @@ static inline unsigned long pte_index(unsigned long address) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr)) +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + pud_index(addr)) #define pmd_offset(pudp,addr) \ (((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr)) #define pte_offset_kernel(dir,addr) \ @@ -1394,11 +1400,11 @@ static inline bool pud_is_leaf(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } -#define pgd_is_leaf 
pgd_is_leaf -#define pgd_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#define p4d_is_leaf p4d_is_leaf +#define p4d_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { - return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); + return !!(p4d_raw(p4d) & cpu_to_be64(_PAGE_PTE)); } #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 08c222d5b764b4..0cba794c4fb880 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -30,7 +30,7 @@ /* Don't have anything in the reserved bits and leaf bits */ #define RADIX_PMD_BAD_BITS 0x60000000000000e0UL #define RADIX_PUD_BAD_BITS 0x60000000000000e0UL -#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL +#define RADIX_P4D_BAD_BITS 0x60000000000000e0UL #define RADIX_PMD_SHIFT (PAGE_SHIFT + RADIX_PTE_INDEX_SIZE) #define RADIX_PUD_SHIFT (RADIX_PMD_SHIFT + RADIX_PMD_INDEX_SIZE) @@ -227,9 +227,9 @@ static inline int radix__pud_bad(pud_t pud) } -static inline int radix__pgd_bad(pgd_t pgd) +static inline int radix__p4d_bad(p4d_t p4d) { - return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); + return !!(p4d_val(p4d) & RADIX_P4D_BAD_BITS); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 639f3b3713ec7a..61fc9e8f12d3a4 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -2,7 +2,6 @@ #ifndef _ASM_POWERPC_NOHASH_32_PGTABLE_H #define _ASM_POWERPC_NOHASH_32_PGTABLE_H -#define __ARCH_USE_5LEVEL_HACK #include #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index b9534a793293a7..668aee6017e7e2 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -15,7 +15,7 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) +#define p4d_populate(MM, P4D, PUD) p4d_set(P4D, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index c40ec32b819460..81b1c54e3cf189 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H #define _ASM_POWERPC_NOHASH_64_PGTABLE_4K_H -#include +#include /* * Entries per page directory level. 
The PTE level must use a 64b record @@ -45,41 +45,41 @@ #define PMD_MASKED_BITS 0 /* Bits to mask out from a PUD to get to the PMD page */ #define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 +/* Bits to mask out from a P4D to get to the PUD page */ +#define P4D_MASKED_BITS 0 /* * 4-level page tables related bits */ -#define pgd_none(pgd) (!pgd_val(pgd)) -#define pgd_bad(pgd) (pgd_val(pgd) == 0) -#define pgd_present(pgd) (pgd_val(pgd) != 0) -#define pgd_page_vaddr(pgd) (pgd_val(pgd) & ~PGD_MASKED_BITS) +#define p4d_none(p4d) (!p4d_val(p4d)) +#define p4d_bad(p4d) (p4d_val(p4d) == 0) +#define p4d_present(p4d) (p4d_val(p4d) != 0) +#define p4d_page_vaddr(p4d) (p4d_val(p4d) & ~P4D_MASKED_BITS) #ifndef __ASSEMBLY__ -static inline void pgd_clear(pgd_t *pgdp) +static inline void p4d_clear(p4d_t *p4dp) { - *pgdp = __pgd(0); + *p4dp = __p4d(0); } -static inline pte_t pgd_pte(pgd_t pgd) +static inline pte_t p4d_pte(p4d_t p4d) { - return __pte(pgd_val(pgd)); + return __pte(p4d_val(p4d)); } -static inline pgd_t pte_pgd(pte_t pte) +static inline p4d_t pte_p4d(pte_t pte) { - return __pgd(pte_val(pte)); + return __p4d(pte_val(pte)); } -extern struct page *pgd_page(pgd_t pgd); +extern struct page *p4d_page(p4d_t p4d); #endif /* !__ASSEMBLY__ */ -#define pud_offset(pgdp, addr) \ - (((pud_t *) pgd_page_vaddr(*(pgdp))) + \ +#define pud_offset(p4dp, addr) \ + (((pud_t *) p4d_page_vaddr(*(p4dp))) + \ (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) #define pud_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 9c703b140d6492..3424381b81dabd 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -175,11 +175,11 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) -#define pgd_write(pgd) pte_write(pgd_pte(pgd)) +#define p4d_write(pgd) pte_write(p4d_pte(p4d)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) +static inline void p4d_set(p4d_t *p4dp, unsigned long val) { - *pgdp = __pgd(val); + *p4dp = __p4d(val); } /* diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 961895be932afc..ae58b524a92422 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,12 +44,12 @@ struct mm_struct; #ifdef CONFIG_PPC32 static inline pmd_t *pmd_ptr(struct mm_struct *mm, unsigned long va) { - return pmd_offset(pud_offset(pgd_offset(mm, va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_ptr_k(unsigned long va) { - return pmd_offset(pud_offset(pgd_offset_k(va), va), va); + return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) @@ -160,9 +160,9 @@ static inline bool pud_is_leaf(pud_t pud) } #endif -#ifndef pgd_is_leaf -#define pgd_is_leaf pgd_is_leaf -static inline bool pgd_is_leaf(pgd_t pgd) +#ifndef p4d_is_leaf +#define p4d_is_leaf p4d_is_leaf +static inline bool p4d_is_leaf(p4d_t p4d) { return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 4b437a3f09d416..117274e69deb18 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -514,13 +514,14 @@ void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) unsigned long ig; for (ig = 0; ig < 
PTRS_PER_PGD; ++ig, ++pgd) { + p4d_t *p4d = p4d_offset(pgd, 0); pud_t *pud; - if (!pgd_present(*pgd)) + if (!p4d_present(*p4d)) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(p4d, 0); kvmppc_unmap_free_pud(kvm, pud, lpid); - pgd_clear(pgd); + p4d_clear(p4d); } } @@ -581,6 +582,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud, *new_pud = NULL; pmd_t *pmd, *new_pmd = NULL; pte_t *ptep, *new_ptep = NULL; @@ -588,9 +590,11 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Traverse the guest's 2nd-level tree, allocate new levels needed */ pgd = pgtable + pgd_index(gpa); + p4d = p4d_offset(pgd, gpa); + pud = NULL; - if (pgd_present(*pgd)) - pud = pud_offset(pgd, gpa); + if (p4d_present(*p4d)) + pud = pud_offset(p4d, gpa); else new_pud = pud_alloc_one(kvm->mm, gpa); @@ -611,13 +615,13 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, /* Now traverse again under the lock and change the tree */ ret = -ENOMEM; - if (pgd_none(*pgd)) { + if (p4d_none(*p4d)) { if (!new_pud) goto out_unlock; - pgd_populate(kvm->mm, pgd, new_pud); + p4d_populate(kvm->mm, p4d, new_pud); new_pud = NULL; } - pud = pud_offset(pgd, gpa); + pud = pud_offset(p4d, gpa); if (pud_is_leaf(*pud)) { unsigned long hgpa = gpa & PUD_MASK; @@ -1263,7 +1267,8 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, unsigned long gpa; pgd_t *pgt; struct kvm_nested_guest *nested; - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ptep; @@ -1336,13 +1341,14 @@ static ssize_t debugfs_radix_read(struct file *file, char __user *buf, } pgdp = pgt + pgd_index(gpa); - pgd = READ_ONCE(*pgdp); - if (!(pgd_val(pgd) & _PAGE_PRESENT)) { - gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + p4dp = p4d_offset(pgdp, gpa); + p4d = READ_ONCE(*p4dp); + if (!(p4d_val(p4d) & _PAGE_PRESENT)) { + gpa = (gpa & P4D_MASK) + P4D_SIZE; continue; } - pudp = pud_offset(&pgd, gpa); + pudp = pud_offset(&p4d, gpa); pud = READ_ONCE(*pudp); if (!(pud_val(pud) & _PAGE_PRESENT)) { gpa = (gpa & PUD_MASK) + PUD_SIZE; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 5ecf0d635a8dfc..e64546b8875c2a 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -113,13 +113,18 @@ static inline int unmap_patch_area(unsigned long addr) pte_t *ptep; pmd_t *pmdp; pud_t *pudp; + p4d_t *p4dp; pgd_t *pgdp; pgdp = pgd_offset_k(addr); if (unlikely(!pgdp)) return -EINVAL; - pudp = pud_offset(pgdp, addr); + p4dp = p4d_offset(pgdp, addr); + if (unlikely(!p4dp)) + return -EINVAL; + + pudp = pud_offset(p4dp, addr); if (unlikely(!pudp)) return -EINVAL; diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64ca375278dcc8..8b4b0a6021582c 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return 
-ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 408176086dd53c..8acb96de0e48a5 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -65,17 +65,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); + p4d_populate(&init_mm, p4dp, pudp); } - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (map_page_size == PUD_SIZE) { ptep = (pte_t *)pudp; goto set_the_pte; @@ -115,6 +117,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, { unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -137,7 +140,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, * boot. */ pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) { @@ -174,6 +178,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end, { unsigned long idx; pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -186,7 +191,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end, for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); + p4dp = p4d_offset(pgdp, idx); + pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; if (pud_is_leaf(*pudp)) { @@ -850,6 +856,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) unsigned long addr, next; pud_t *pud_base; pgd_t *pgd; + p4d_t *p4d; spin_lock(&init_mm.page_table_lock); @@ -857,15 +864,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) continue; - if (pgd_is_leaf(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); + if (p4d_is_leaf(*p4d)) { + split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d); continue; } - pud_base = (pud_t *)pgd_page_vaddr(*pgd); + pud_base = (pud_t *)p4d_page_vaddr(*p4d); remove_pud_table(pud_base, addr, next); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 2ef24a53f4c916..25a0c044bd9322 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, int npages) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) return; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return; pmd = pmd_offset(pud, addr); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 11e06b4c3ebe27..5b3d0140426616 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -110,6 +110,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t 
*hpdp, pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pg; + p4d_t *p4; pud_t *pu; pmd_t *pm; hugepd_t *hpdp = NULL; @@ -119,20 +120,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz addr &= ~(sz-1); pg = pgd_offset(mm, addr); + p4 = p4d_offset(pg, addr); #ifdef CONFIG_PPC_BOOK3S_64 if (pshift == PGDIR_SHIFT) /* 16GB huge page */ - return (pte_t *) pg; + return (pte_t *) p4; else if (pshift > PUD_SHIFT) { /* * We need to use hugepd table */ ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift == PUD_SHIFT) @@ -157,10 +159,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz #else if (pshift >= PGDIR_SHIFT) { ptl = &mm->page_table_lock; - hpdp = (hugepd_t *)pg; + hpdp = (hugepd_t *)p4; } else { pdshift = PUD_SHIFT; - pu = pud_alloc(mm, pg, addr); + pu = pud_alloc(mm, p4, addr); if (!pu) return NULL; if (pshift >= PUD_SHIFT) { @@ -397,7 +399,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } -static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -407,7 +409,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, start = addr; do { - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); next = pud_addr_end(addr, end); if (!is_hugepd(__hugepd(pud_val(*pud)))) { if (pud_none_or_clear_bad(pud)) @@ -442,8 +444,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - pud = pud_offset(pgd, start); - pgd_clear(pgd); + pud = pud_offset(p4d, start); + p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } @@ -456,6 +458,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; + p4d_t *p4d; unsigned long next; /* @@ -478,10 +481,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); pgd = pgd_offset(tlb->mm, addr); + p4d = p4d_offset(pgd, addr); if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { - if (pgd_none_or_clear_bad(pgd)) + if (p4d_none_or_clear_bad(p4d)) continue; - hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); } else { unsigned long more; /* @@ -494,7 +498,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, if (more > next) next = more; - free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, addr, next, floor, ceiling); } } while (addr = next, addr != end); diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 4637fdd469cf07..77884e24281dd5 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size) int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); if (slab_is_available()) { pgdp = 
pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); + p4dp = p4d_offset(pgdp, ea); + pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; pmdp = pmd_alloc(&init_mm, pudp, ea); @@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) return -ENOMEM; } else { pgdp = pgd_offset_k(ea); -#ifndef __PAGETABLE_PUD_FOLDED - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - pgd_populate(&init_mm, pgdp, pudp); + p4dp = p4d_offset(pgdp, ea); + if (p4d_none(*p4dp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + p4d_populate(&init_mm, p4dp, pmdp); } -#endif /* !__PAGETABLE_PUD_FOLDED */ - pudp = pud_offset(pgdp, ea); + pudp = pud_offset(p4dp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 60c4b8ff046cbf..cea5b4e25a248c 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -291,6 +291,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -298,7 +299,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) return; pgd = mm->pgd + pgd_index(addr); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); /* @@ -338,12 +341,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys); pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *hpage_shift) { - pgd_t pgd, *pgdp; + pgd_t *pgdp; + p4d_t p4d, *p4dp; pud_t pud, *pudp; pmd_t pmd, *pmdp; pte_t *ret_pte; hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; + unsigned pdshift; if (hpage_shift) *hpage_shift = 0; @@ -351,24 +355,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (is_thp) *is_thp = false; - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); /* * Always operate on the local stack value. This make sure the * value don't get updated by a parallel THP split/collapse, * page fault or a page unmap. The return pte_t * is still not * stable. So should be checked there for above conditions. + * Top level is an exception because it is folded into p4d. 
*/ - if (pgd_none(pgd)) + pgdp = pgdir + pgd_index(ea); + p4dp = p4d_offset(pgdp, ea); + p4d = READ_ONCE(*p4dp); + pdshift = P4D_SHIFT; + + if (p4d_none(p4d)) return NULL; - if (pgd_is_leaf(pgd)) { - ret_pte = (pte_t *)pgdp; + if (p4d_is_leaf(p4d)) { + ret_pte = (pte_t *)p4dp; goto out; } - if (is_hugepd(__hugepd(pgd_val(pgd)))) { - hpdp = (hugepd_t *)&pgd; + if (is_hugepd(__hugepd(p4d_val(p4d)))) { + hpdp = (hugepd_t *)&p4d; goto out_huge; } @@ -378,7 +386,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, * irq disabled */ pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); + pudp = pud_offset(&p4d, ea); pud = READ_ONCE(*pudp); if (pud_none(pud)) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e78832dce7bb43..1f86a88fd4bb43 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift); #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ -struct page *pgd_page(pgd_t pgd) +struct page *p4d_page(p4d_t p4d) { - if (pgd_is_leaf(pgd)) { - VM_WARN_ON(!pgd_huge(pgd)); - return pte_page(pgd_pte(pgd)); + if (p4d_is_leaf(p4d)) { + VM_WARN_ON(!p4d_huge(p4d)); + return pte_page(p4d_pte(p4d)); } - return virt_to_page(pgd_page_vaddr(pgd)); + return virt_to_page(p4d_page_vaddr(p4d)); } #endif diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b6ed9578382ff8..6aaeb1eb3b9c57 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) } } +static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start) +{ + p4d_t *p4d = p4d_offset(pgd, 0); + unsigned long addr; + unsigned int i; + + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + addr = start + i * P4D_SIZE; + if (!p4d_none(*p4d)) + /* p4d exists */ + walk_pud(st, p4d, addr); + } +} + static void walk_pagetables(struct pg_state *st) { pgd_t *pgd = pgd_offset_k(0UL); @@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st) addr = KERN_VIRT_START + i * PGDIR_SIZE; if (!pgd_none(*pgd)) /* pgd exists */ - walk_pud(st, pgd, addr); + walk_p4d(st, pgd, addr); } } diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5fc880e301756b..36958c013e6571 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -306,9 +306,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(p4d, 0); unsigned long addr; unsigned int i; @@ -333,13 +333,15 @@ static void walk_pagetables(struct pg_state *st) * the hash pagetable. 
*/ for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { + p4d_t *p4d = p4d_offset(pgd, 0); + if (pgd_none(*pgd) || pgd_is_leaf(*pgd)) - note_page(st, addr, 1, pgd_val(*pgd), PGDIR_SIZE); - else if (is_hugepd(__hugepd(pgd_val(*pgd)))) + note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); + else if (is_hugepd(__hugepd(p4d_val(*p4d)))) walk_hugepd(st, (hugepd_t *)pgd, addr, PGDIR_SHIFT, 1); else /* pgd exists */ - walk_pud(st, pgd, addr); + walk_pud(st, p4d, addr); } } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 9de08ef8395a8c..eb959cc619cada 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3214,6 +3214,7 @@ static void show_pte(unsigned long addr) struct task_struct *tsk = NULL; struct mm_struct *mm; pgd_t *pgdp, *pgdir; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; @@ -3245,20 +3246,21 @@ static void show_pte(unsigned long addr) pgdir = pgd_offset(mm, 0); } - if (pgd_none(*pgdp)) { - printf("no linux page table for address\n"); + p4dp = p4d_offset(pgdp, addr); + + if (p4d_none(*p4dp)) { + printf("No valid P4D\n"); return; } - printf("pgd @ 0x%px\n", pgdir); - - if (pgd_is_leaf(*pgdp)) { - format_pte(pgdp, pgd_val(*pgdp)); + if (p4d_is_leaf(*p4dp)) { + format_pte(p4dp, p4d_val(*p4dp)); return; } - printf("pgdp @ 0x%px = 0x%016lx\n", pgdp, pgd_val(*pgdp)); - pudp = pud_offset(pgdp, addr); + printf("p4dp @ 0x%px = 0x%016lx\n", p4dp, p4d_val(*p4dp)); + + pudp = pud_offset(p4dp, addr); if (pud_none(*pudp)) { printf("No valid PUD\n"); From 97239d9bbe9e55175a5a6bd7afae5dbf4f5447de Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 4 Jun 2020 16:33:01 +1000 Subject: [PATCH 111/241] fixup for powerpc ptdump.c Signed-off-by: Stephen Rothwell --- arch/powerpc/mm/ptdump/ptdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 36958c013e6571..3209f78297ad64 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -335,12 +335,12 @@ static void walk_pagetables(struct pg_state *st) for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { p4d_t *p4d = p4d_offset(pgd, 0); - if (pgd_none(*pgd) || pgd_is_leaf(*pgd)) + if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)pgd, addr, PGDIR_SHIFT, 1); + walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); else - /* pgd exists */ + /* p4d exists */ walk_pud(st, p4d, addr); } } From 5319f65de9e4db65559a8f188cd60bff06f0e601 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:14 +1000 Subject: [PATCH 112/241] powerpc/xmon: drop unused pgdir variable in show_pte() function The kernel build robot complained: arch/powerpc/xmon/xmon.c: In function 'show_pte': >> arch/powerpc/xmon/xmon.c:3138:16: warning: variable 'pgdir' set but not >> used [-Wunused-but-set-variable] 3138 | pgd_t *pgdp, *pgdir; | ^~~~~ Remove the unused pgdir variable and adjust the if () else statement to comply with the coding style. 
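The robot's diagnostic is easy to reproduce in isolation; a minimal,
hypothetical example:

	/* gcc -Wall enables -Wunused-but-set-variable, which flags any
	 * variable that is assigned but never read afterwards. */
	void demo(void)
	{
		int pgdir;

		pgdir = 42;	/* set here ... */
	}			/* ... but never used: warns */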
Link: http://lkml.kernel.org/r/20200519181454.GI1059226@linux.ibm.com Reported-by: kbuild test robot Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/powerpc/xmon/xmon.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index eb959cc619cada..bfd15293337614 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3213,7 +3213,7 @@ static void show_pte(unsigned long addr) unsigned long tskv = 0; struct task_struct *tsk = NULL; struct mm_struct *mm; - pgd_t *pgdp, *pgdir; + pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; @@ -3238,13 +3238,10 @@ static void show_pte(unsigned long addr) catch_memory_errors = 1; sync(); - if (mm == &init_mm) { + if (mm == &init_mm) pgdp = pgd_offset_k(addr); - pgdir = pgd_offset_k(0); - } else { + else pgdp = pgd_offset(mm, addr); - pgdir = pgd_offset(mm, 0); - } p4dp = p4d_offset(pgdp, addr); From 0bf77d759b73b7319e81d4c578b26d390041369e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 4 Jun 2020 11:46:15 +1000 Subject: [PATCH 113/241] sh: fault: Modernize printing of kernel messages - Convert from printk() to pr_*(), - Add missing continuations, - Use "%llx" to format u64, - Join multiple prints in show_fault_oops() into a single print. Link: http://lkml.kernel.org/r/20200414153455.21744-10-rppt@kernel.org Signed-off-by: Geert Uytterhoeven Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/mm/fault.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index f5da8f5ea3899c..5236cd29b7dd8d 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -47,10 +47,10 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) pgd = swapper_pg_dir; } - printk(KERN_ALERT "pgd = %p\n", pgd); + pr_alert("pgd = %p\n", pgd); pgd += pgd_index(addr); - printk(KERN_ALERT "[%08lx] *pgd=%0*Lx", addr, - (u32)(sizeof(*pgd) * 2), (u64)pgd_val(*pgd)); + pr_alert("[%08lx] *pgd=%0*llx", addr, (u32)(sizeof(*pgd) * 2), + (u64)pgd_val(*pgd)); do { pud_t *pud; @@ -61,33 +61,33 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; if (pgd_bad(*pgd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pud = pud_offset(pgd, addr); if (PTRS_PER_PUD != 1) - printk(", *pud=%0*Lx", (u32)(sizeof(*pud) * 2), - (u64)pud_val(*pud)); + pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2), + (u64)pud_val(*pud)); if (pud_none(*pud)) break; if (pud_bad(*pud)) { - printk("(bad)"); + pr_cont("(bad)"); break; } pmd = pmd_offset(pud, addr); if (PTRS_PER_PMD != 1) - printk(", *pmd=%0*Lx", (u32)(sizeof(*pmd) * 2), - (u64)pmd_val(*pmd)); + pr_cont(", *pmd=%0*llx", (u32)(sizeof(*pmd) * 2), + (u64)pmd_val(*pmd)); if (pmd_none(*pmd)) break; if (pmd_bad(*pmd)) { - printk("(bad)"); + pr_cont("(bad)"); break; } @@ -96,11 +96,11 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; pte = 
pte_offset_kernel(pmd, addr);
-		printk(", *pte=%0*Lx", (u32)(sizeof(*pte) * 2),
-		       (u64)pte_val(*pte));
+		pr_cont(", *pte=%0*llx", (u32)(sizeof(*pte) * 2),
+			(u64)pte_val(*pte));
 	} while (0);

-	printk("\n");
+	pr_cont("\n");
 }

 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
@@ -188,14 +188,12 @@ show_fault_oops(struct pt_regs *regs, unsigned long address)
 	if (!oops_may_print())
 		return;

-	printk(KERN_ALERT "BUG: unable to handle kernel ");
-	if (address < PAGE_SIZE)
-		printk(KERN_CONT "NULL pointer dereference");
-	else
-		printk(KERN_CONT "paging request");
-
-	printk(KERN_CONT " at %08lx\n", address);
-	printk(KERN_ALERT "PC:");
+	pr_alert("BUG: unable to handle kernel %s at %08lx\n",
+		 address < PAGE_SIZE ? "NULL pointer dereference"
+				     : "paging request",
+		 address);
+	pr_alert("PC:");
 	printk_address(regs->pc, 1, KERN_ALERT);

 	show_pte(NULL, address);

From 27d4bfa01b33fcc3946ccd44a951f4cb2ae42c16 Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Thu, 4 Jun 2020 11:46:15 +1000
Subject: [PATCH 114/241] sh: drop __pXd_offset() macros that duplicate pXd_index() ones

The __pXd_offset() macros are identical to the pXd_index() macros and
there is no point to keep both of them.  All architectures define and use
pXd_index() so let's keep only those to make sh consistent with the rest
of the kernel.

Link: http://lkml.kernel.org/r/20200414153455.21744-11-rppt@kernel.org
Signed-off-by: Mike Rapoport
Cc: Arnd Bergmann
Cc: Benjamin Herrenschmidt
Cc: Brian Cain
Cc: Catalin Marinas
Cc: Christophe Leroy
Cc: Fenghua Yu
Cc: Geert Uytterhoeven
Cc: Guan Xuetao
Cc: James Morse
Cc: Jonas Bonn
Cc: Julien Thierry
Cc: Ley Foon Tan
Cc: Marc Zyngier
Cc: Michael Ellerman
Cc: Paul Mackerras
Cc: Rich Felker
Cc: Russell King
Cc: Stafford Horne
Cc: Stefan Kristiansson
Cc: Suzuki K Poulose
Cc: Tony Luck
Cc: Will Deacon
Cc: Yoshinori Sato
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 arch/sh/include/asm/pgtable_32.h | 5 ++---
 arch/sh/mm/init.c | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index 29274f0e428ea7..4acce5f2cbf9a7 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -407,13 +407,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, address)	((mm)->pgd + pgd_index(address))
-#define __pgd_offset(address)	pgd_index(address)

 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(address)	pgd_offset(&init_mm, address)

-#define __pud_offset(address)	(((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define __pmd_offset(address)	(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
+#define pud_index(address)	(((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#define pmd_index(address)	(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))

 /* Find an entry in the third-level page table..
*/ #define pte_index(address) ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 628f461b899313..ddeeaa56760085 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -172,9 +172,9 @@ void __init page_table_range_init(unsigned long start, unsigned long end, unsigned long vaddr; vaddr = start; - i = __pgd_offset(vaddr); - j = __pud_offset(vaddr); - k = __pmd_offset(vaddr); + i = pgd_index(vaddr); + j = pud_index(vaddr); + k = pmd_index(vaddr); pgd = pgd_base + i; for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { From 277c3d7a2da3574fa5582b24251676ba14a2104b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:16 +1000 Subject: [PATCH 115/241] sh: add support for folded p4d page tables Implement primitives necessary for the 4th level folding, add walks of p4d level where appropriate and remove usage of __ARCH_USE_5LEVEL_HACK. Link: http://lkml.kernel.org/r/20200414153455.21744-12-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/sh/include/asm/pgtable-2level.h | 1 - arch/sh/include/asm/pgtable-3level.h | 1 - arch/sh/kernel/io_trapped.c | 7 ++++++- arch/sh/mm/cache-sh4.c | 4 +++- arch/sh/mm/fault.c | 26 +++++++++++++++++++++++--- arch/sh/mm/hugetlbpage.c | 28 ++++++++++++++++++---------- arch/sh/mm/init.c | 9 ++++++++- arch/sh/mm/kmap.c | 2 +- arch/sh/mm/tlbex_32.c | 6 +++++- 9 files changed, 64 insertions(+), 20 deletions(-) diff --git a/arch/sh/include/asm/pgtable-2level.h b/arch/sh/include/asm/pgtable-2level.h index bf1eb51c3ee5ab..08bff93927ffde 100644 --- a/arch/sh/include/asm/pgtable-2level.h +++ b/arch/sh/include/asm/pgtable-2level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_2LEVEL_H #define __ASM_SH_PGTABLE_2LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include /* diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 779260b721cae8..0f80097e5c9c4e 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -2,7 +2,6 @@ #ifndef __ASM_SH_PGTABLE_3LEVEL_H #define __ASM_SH_PGTABLE_3LEVEL_H -#define __ARCH_USE_5LEVEL_HACK #include /* diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index 60c828a2b8a2c1..037aab2708b7ac 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -136,6 +136,7 @@ EXPORT_SYMBOL_GPL(match_trapped_io_handler); static struct trapped_io *lookup_tiop(unsigned long address) { pgd_t *pgd_k; + p4d_t *p4d_k; pud_t *pud_k; pmd_t *pmd_k; pte_t *pte_k; @@ -145,7 +146,11 @@ static struct trapped_io *lookup_tiop(unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud_k = pud_offset(pgd_k, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index eee911422cf9eb..45943bcb7042d7 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -209,6 +209,7 @@ static void sh4_flush_cache_page(void *args) unsigned 
long address, pfn, phys; int map_coherent = 0; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -224,7 +225,8 @@ static void sh4_flush_cache_page(void *args) return; pgd = pgd_offset(vma->vm_mm, address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 5236cd29b7dd8d..67d0e739897ce9 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -53,6 +53,7 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) (u64)pgd_val(*pgd)); do { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -65,7 +66,20 @@ static void show_pte(struct mm_struct *mm, unsigned long addr) break; } - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (PTRS_PER_P4D != 1) + pr_cont(", *p4d=%0*Lx", (u32)(sizeof(*p4d) * 2), + (u64)p4d_val(*p4d)); + + if (p4d_none(*p4d)) + break; + + if (p4d_bad(*p4d)) { + pr_cont("(bad)"); + break; + } + + pud = pud_offset(p4d, addr); if (PTRS_PER_PUD != 1) pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2), (u64)pud_val(*pud)); @@ -107,6 +121,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -116,8 +131,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pgd_present(*pgd_k)) return NULL; - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 960deb1f24a17b..acd5652a0de3df 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -26,17 +26,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (p4d) { + pud = pud_alloc(mm, p4d, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); + } } } @@ -47,17 +51,21 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (pgd) { - pud = pud_offset(pgd, addr); - if (pud) { - pmd = pmd_offset(pud, addr); - if (pmd) - pte = pte_offset_map(pmd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d) { + pud = pud_offset(p4d, addr); + if (pud) { + pmd = pmd_offset(pud, addr); + if (pmd) + pte = pte_offset_map(pmd, addr); + } } } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ddeeaa56760085..a70ba0fdd0b382 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -45,6 +45,7 @@ void __init __weak plat_mem_setup(void) static pte_t *__get_pte_phys(unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -54,7 +55,13 @@ static pte_t *__get_pte_phys(unsigned long addr) return NULL; } - pud = pud_alloc(NULL, pgd, addr); + p4d = p4d_alloc(NULL, pgd, addr); + if (unlikely(!p4d)) { + p4d_ERROR(*p4d); + return NULL; + } + + pud = 
pud_alloc(NULL, p4d, addr); if (unlikely(!pud)) { pud_ERROR(*pud); return NULL; diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 9e6b38b03cf7b2..0e7039137f5a44 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -15,7 +15,7 @@ #include #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(p4d_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr)), vaddr) static pte_t *kmap_coherent_pte; diff --git a/arch/sh/mm/tlbex_32.c b/arch/sh/mm/tlbex_32.c index 382262dc0c4bc5..1c53868632ee4c 100644 --- a/arch/sh/mm/tlbex_32.c +++ b/arch/sh/mm/tlbex_32.c @@ -23,6 +23,7 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -42,7 +43,10 @@ handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, pgd = pgd_offset(current->mm, address); } - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none_or_clear_bad(p4d)) + return 1; + pud = pud_offset(p4d, address); if (pud_none_or_clear_bad(pud)) return 1; pmd = pmd_offset(pud, address); From f8e871025f83eff062faebdc1eb2542b3a395ef3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:17 +1000 Subject: [PATCH 116/241] unicore32: remove __ARCH_USE_5LEVEL_HACK The unicore32 architecture has 2 level page tables and asm-generic/pgtable-nopmd.h and explicit casts from pud_t to pgd_t for page table folding. Add p4d walk in the only place that actually unfolds the pud level and remove __ARCH_USE_5LEVEL_HACK. Link: http://lkml.kernel.org/r/20200414153455.21744-13-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- arch/unicore32/include/asm/pgtable.h | 1 - arch/unicore32/kernel/hibernate.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 3b8731b3a93786..826f49edd94e13 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -9,7 +9,6 @@ #ifndef __UNICORE_PGTABLE_H__ #define __UNICORE_PGTABLE_H__ -#define __ARCH_USE_5LEVEL_HACK #include #include diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c index f3812245cc00d7..ccad051a79b64e 100644 --- a/arch/unicore32/kernel/hibernate.c +++ b/arch/unicore32/kernel/hibernate.c @@ -33,9 +33,11 @@ struct swsusp_arch_regs swsusp_arch_regs_cpu0; static pmd_t *resume_one_md_table_init(pgd_t *pgd) { pud_t *pud; + p4d_t *p4d; pmd_t *pmd_table; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; From 7c96cf58837b78af3bd96c209cdce05cc3aa3739 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:17 +1000 Subject: [PATCH 117/241] asm-generic: remove pgtable-nop4d-hack.h No architecture defines __ARCH_USE_5LEVEL_HACK and therefore pgtable-nop4d-hack.h will be never actually included. Remove it. 
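With the hack gone, only the regular folding in
include/asm-generic/pgtable-nopud.h remains in use.  A condensed,
illustrative extract of that folding idiom (not the complete header):

	/*
	 * With the pud folded into the p4d, a pud_t is just a wrapper
	 * around the p4d entry, and pud_offset() reinterprets the
	 * pointer; no extra memory or allocation is involved.
	 */
	typedef struct { p4d_t p4d; } pud_t;

	static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
	{
		return (pud_t *)p4d;
	}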
Link: http://lkml.kernel.org/r/20200414153455.21744-14-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/asm-generic/pgtable-nop4d-hack.h | 64 ------------------------ include/asm-generic/pgtable-nopud.h | 4 -- 2 files changed, 68 deletions(-) delete mode 100644 include/asm-generic/pgtable-nop4d-hack.h diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h deleted file mode 100644 index 829bdb0d6327d7..00000000000000 --- a/include/asm-generic/pgtable-nop4d-hack.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PGTABLE_NOP4D_HACK_H -#define _PGTABLE_NOP4D_HACK_H - -#ifndef __ASSEMBLY__ -#include - -#define __PAGETABLE_PUD_FOLDED 1 - -/* - * Having the pud type consist of a pgd gets the size right, and allows - * us to conceptually access the pgd entry that this pud is folded into - * without casting. - */ -typedef struct { pgd_t pgd; } pud_t; - -#define PUD_SHIFT PGDIR_SHIFT -#define PTRS_PER_PUD 1 -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* - * The "pgd_xxx()" functions here are trivial for a folded two-level - * setup: the pud is never bad, and a pud always exists (as it's folded - * into the pgd entry) - */ -static inline int pgd_none(pgd_t pgd) { return 0; } -static inline int pgd_bad(pgd_t pgd) { return 0; } -static inline int pgd_present(pgd_t pgd) { return 1; } -static inline void pgd_clear(pgd_t *pgd) { } -#define pud_ERROR(pud) (pgd_ERROR((pud).pgd)) - -#define pgd_populate(mm, pgd, pud) do { } while (0) -#define pgd_populate_safe(mm, pgd, pud) do { } while (0) -/* - * (puds are folded into pgds so this doesn't get actually called, - * but the define is needed for a generic inline function.) - */ -#define set_pgd(pgdptr, pgdval) set_pud((pud_t *)(pgdptr), (pud_t) { pgdval }) - -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) -{ - return (pud_t *)pgd; -} - -#define pud_val(x) (pgd_val((x).pgd)) -#define __pud(x) ((pud_t) { __pgd(x) }) - -#define pgd_page(pgd) (pud_page((pud_t){ pgd })) -#define pgd_page_vaddr(pgd) (pud_page_vaddr((pud_t){ pgd })) - -/* - * allocating and freeing a pud is trivial: the 1-entry pud is - * inside the pgd, so has no extra memory associated with it. 
- */ -#define pud_alloc_one(mm, address) NULL -#define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x, a) do { } while (0) - -#undef pud_addr_end -#define pud_addr_end(addr, end) (end) - -#endif /* __ASSEMBLY__ */ -#endif /* _PGTABLE_NOP4D_HACK_H */ diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index d3776cb494c0b3..ad05c1684bfc54 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -4,9 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef __ARCH_USE_5LEVEL_HACK -#include -#else #include #define __PAGETABLE_PUD_FOLDED 1 @@ -65,5 +62,4 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) #define pud_addr_end(addr, end) (end) #endif /* __ASSEMBLY__ */ -#endif /* !__ARCH_USE_5LEVEL_HACK */ #endif /* _PGTABLE_NOPUD_H */ From e9a866156497d55cc0bb0b4227672855ca90a538 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Jun 2020 11:46:18 +1000 Subject: [PATCH 118/241] mm: remove __ARCH_HAS_5LEVEL_HACK and include/asm-generic/5level-fixup.h There are no architectures that use include/asm-generic/5level-fixup.h therefore it can be removed along with __ARCH_HAS_5LEVEL_HACK define and the code it surrounds Link: http://lkml.kernel.org/r/20200414153455.21744-15-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: James Morse Cc: Jonas Bonn Cc: Julien Thierry Cc: Ley Foon Tan Cc: Marc Zyngier Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Suzuki K Poulose Cc: Tony Luck Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/asm-generic/5level-fixup.h | 59 ------------------------------ include/linux/mm.h | 7 ---- mm/kasan/init.c | 11 ------ mm/memory.c | 8 ---- 4 files changed, 85 deletions(-) delete mode 100644 include/asm-generic/5level-fixup.h diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h deleted file mode 100644 index 58046ddc08d02b..00000000000000 --- a/include/asm-generic/5level-fixup.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _5LEVEL_FIXUP_H -#define _5LEVEL_FIXUP_H - -#define __ARCH_HAS_5LEVEL_HACK -#define __PAGETABLE_P4D_FOLDED 1 - -#define P4D_SHIFT PGDIR_SHIFT -#define P4D_SIZE PGDIR_SIZE -#define P4D_MASK PGDIR_MASK -#define MAX_PTRS_PER_P4D 1 -#define PTRS_PER_P4D 1 - -#define p4d_t pgd_t - -#define pud_alloc(mm, p4d, address) \ - ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? 
\ - NULL : pud_offset(p4d, address)) - -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_alloc_track(mm, pgd, address, mask) (pgd) -#define p4d_offset(pgd, start) (pgd) - -#ifndef __ASSEMBLY__ -static inline int p4d_none(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_bad(p4d_t p4d) -{ - return 0; -} - -static inline int p4d_present(p4d_t p4d) -{ - return 1; -} -#endif - -#define p4d_ERROR(p4d) do { } while (0) -#define p4d_clear(p4d) pgd_clear(p4d) -#define p4d_val(p4d) pgd_val(p4d) -#define p4d_populate(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_populate_safe(mm, p4d, pud) pgd_populate(mm, p4d, pud) -#define p4d_page(p4d) pgd_page(p4d) -#define p4d_page_vaddr(p4d) pgd_page_vaddr(p4d) - -#define __p4d(x) __pgd(x) -#define set_p4d(p4dp, p4d) set_pgd(p4dp, p4d) - -#undef p4d_free_tlb -#define p4d_free_tlb(tlb, x, addr) do { } while (0) -#define p4d_free(mm, x) do { } while (0) - -#undef p4d_addr_end -#define p4d_addr_end(addr, end) (end) - -#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index b1b1431e2b475a..9546e6c5b8617d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2079,11 +2079,6 @@ int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) -/* - * The following ifdef needed to get the 5level-fixup.h header to work. - * Remove it when 5level-fixup.h has been removed. - */ -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2112,8 +2107,6 @@ static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, return p4d_offset(pgd, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ - static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcdbd..fe6be0be1f763c 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_early_shadow_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. */ -#ifndef __ARCH_HAS_5LEVEL_HACK pgd_populate(&init_mm, pgd, lm_alias(kasan_early_shadow_p4d)); -#endif p4d = p4d_offset(pgd, addr); p4d_populate(&init_mm, p4d, lm_alias(kasan_early_shadow_pud)); diff --git a/mm/memory.c b/mm/memory.c index 58a74acafcd0ac..ac799521f5addd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4436,19 +4436,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); -#else - if (!pgd_present(*p4d)) { - mm_inc_nr_puds(mm); - pgd_populate(mm, p4d, new); - } else /* Another has populated it */ - pud_free(mm, new); -#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } From 65a7b4807c45811af598cb3fcfef45d77394b01b Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 4 Jun 2020 11:46:18 +1000 Subject: [PATCH 119/241] net-zerocopy: use vm_insert_pages() for tcp rcv zerocopy Use vm_insert_pages() for tcp receive zerocopy. 
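In outline, the receive path now collects page pointers into a small array
and maps each full batch with a single call, where the old code took the
page-table locks once per page via vm_insert_page().  A hedged sketch of
the batching pattern (the insert_page_batch() helper is hypothetical, not
the patch's code):

	/*
	 * Map up to 'nr' pages at 'addr' with one vm_insert_pages() call.
	 * On return, 'remaining' holds the count of pages that were not
	 * mapped, so (nr - remaining) pages are mapped even on error.
	 */
	static int insert_page_batch(struct vm_area_struct *vma,
				     unsigned long addr,
				     struct page **pages, unsigned long nr)
	{
		unsigned long remaining = nr;
		int err = vm_insert_pages(vma, addr, pages, &remaining);

		if (err)
			pr_debug("mapped %lu of %lu pages\n",
				 nr - remaining, nr);
		return err;
	}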
Spin lock cycles (as reported by perf) drop from a couple of percentage points to a fraction of a percent. This results in a roughly 6% increase in efficiency, measured roughly as zerocopy receive count divided by CPU utilization. The intention of this patchset is to reduce atomic ops for tcp zerocopy receives, which normally hits the same spinlock multiple times consecutively. [akpm@linux-foundation.org: suppress gcc-7.2.0 warning] Link: http://lkml.kernel.org/r/20200128025958.43490-3-arjunroy.kdev@gmail.com Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Cc: David Miller Cc: Matthew Wilcox Cc: Jason Gunthorpe Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- net/ipv4/tcp.c | 70 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 15d47d5e795105..ecbba0abd3e5b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, + struct page **pages, + unsigned long pages_to_map, + unsigned long *insert_addr, + u32 *length_with_pending, + u32 *seq, + struct tcp_zerocopy_receive *zc) +{ + unsigned long pages_remaining = pages_to_map; + int bytes_mapped; + int ret; + + ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); + bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + /* Even if vm_insert_pages fails, it may have partially succeeded in + * mapping (some but not all of the pages). + */ + *seq += bytes_mapped; + *insert_addr += bytes_mapped; + if (ret) { + /* But if vm_insert_pages did fail, we have to unroll some state + * we speculatively touched before. + */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + *length_with_pending -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return ret; +} + static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; u32 length = 0, seq, offset, zap_len; + #define PAGE_BATCH_SIZE 8 + struct page *pages[PAGE_BATCH_SIZE]; const skb_frag_t *frags = NULL; struct vm_area_struct *vma; struct sk_buff *skb = NULL; + unsigned long pg_idx = 0; + unsigned long curr_addr; struct tcp_sock *tp; int inq; int ret; @@ -1762,6 +1796,8 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); + tp = tcp_sk(sk); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, address); @@ -1771,7 +1807,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - tp = tcp_sk(sk); seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); @@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint = zc->length; } ret = 0; + curr_addr = address; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { + /* If we're here, finish the current batch. 
*/ + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pg_idx, + &curr_addr, + &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } - zc->recv_skip_hint = skb->len - offset; offset -= skb_headlen(skb); if ((int)offset < 0 || skb_has_frag_list(skb)) @@ -1817,14 +1863,24 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint -= remaining; break; } - ret = vm_insert_page(vma, address + length, - skb_frag_page(frags)); - if (ret) - break; + pages[pg_idx] = skb_frag_page(frags); + pg_idx++; length += PAGE_SIZE; - seq += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; + if (pg_idx == PAGE_BATCH_SIZE) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } + } + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, &seq, + zc); } out: up_read(¤t->mm->mmap_sem); From 2a1b1466a5fcb42dc791b859299a73e47e3892c5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 4 Jun 2020 11:46:19 +1000 Subject: [PATCH 120/241] mm/mmap.c: add more sanity checks to get_unmapped_area() Generic get_unmapped_area() function does sanity checks of address and length of the area to be mapped. Yet, it lacks checking against mmap_min_addr and mmap_end limits. At the same time the default implementation of functions arch_get_unmapped_area[_topdown]() and some architecture callbacks do mmap_min_addr and mmap_end checks on their own. Put additional checks into the generic code and do not let architecture callbacks to get away with a possible area outside of the allowed limits. That could also relieve arch_get_unmapped_area[_topdown]() callbacks of own address and length sanity checks. Link: http://lkml.kernel.org/r/d14f2cff3c891ef2c4b0337d737c6f04beacb124.1584958099.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- mm/mmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1811918706d82a..5b93fbc69c5e48 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2208,12 +2208,13 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + const unsigned long mmap_end = arch_get_mmap_end(addr); unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ - if (len > TASK_SIZE) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; get_area = current->mm->get_unmapped_area; @@ -2234,7 +2235,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (IS_ERR_VALUE(addr)) return addr; - if (addr > TASK_SIZE - len) + if ((addr < mmap_min_addr) || (addr > mmap_end - len)) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; From 6693fac5fce28f16863dde95bb010c9a607f8f42 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 4 Jun 2020 11:46:20 +1000 Subject: [PATCH 121/241] mm/mmap.c: do not allow mappings outside of allowed limits One can set a lowest possible address in /proc/sys/vm/mmap_min_addr and mmap below that bound nevertheless. It is possible to request a fixed mapping address below mmap_min_addr and succeed. 
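A minimal userspace probe of the reported behaviour (illustrative only; it
assumes a non-zero /proc/sys/vm/mmap_min_addr, and the pre-patch success is
as reported above, not verified here):

	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		/* 0x1000 is below the common mmap_min_addr default of 65536. */
		void *p = mmap((void *)0x1000, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

		if (p == MAP_FAILED)
			perror("mmap");	/* expected after this patch: ENOMEM */
		else
			printf("mapped at %p, below mmap_min_addr\n", p);
		return 0;
	}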
This update adds early checks of the mmap_min_addr and mmap_end boundaries
and fixes the above issue.  Apart from being wrong, I am not aware of any
existing issue it causes.

Link: http://lkml.kernel.org/r/d6da1319114a331095052638f0ffa3ccb0be58f1.1584958099.git.agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 mm/mmap.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 5b93fbc69c5e48..9ff93ad069fc49 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -62,6 +62,14 @@
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif

+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr)	(TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
 const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -1369,6 +1377,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			unsigned long pgoff, unsigned long *populate,
 			struct list_head *uf)
 {
+	const unsigned long mmap_end = arch_get_mmap_end(addr);
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;

@@ -1391,8 +1400,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;

-	if (!(flags & MAP_FIXED))
+	if (flags & MAP_FIXED) {
+		if ((addr < mmap_min_addr) || (addr > mmap_end))
+			return -ENOMEM;
+	} else {
 		addr = round_hint_to_min(addr);
+	}

 	/* Careful about overflows.. */
 	len = PAGE_ALIGN(len);
@@ -2089,14 +2102,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
 	return addr;
 }

-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr)	(TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
 *

From a12dd58107e759e286642b708229cd50d7004e9a Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Thu, 4 Jun 2020 11:46:22 +1000
Subject: [PATCH 122/241] mm: don't include asm/pgtable.h if linux/mm.h is already included

Patch series "mm: consolidate definitions of page table accessors", v2.

The low level page table accessors (pXY_index(), pXY_offset()) are
duplicated across all architectures and sometimes more than once.  For
instance, we have 31 definitions of pgd_offset() for 25 supported
architectures.

Most of these definitions are actually identical and typically it boils
down to, e.g.

	static inline unsigned long pmd_index(unsigned long address)
	{
		return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
	}

	static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
	{
		return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
	}

These definitions can be shared among 90% of the arches provided
XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined.

For architectures that really need a custom version there is always the
possibility to override the generic version with the usual ifdefs magic.

These patches introduce include/linux/pgtable.h that replaces
include/asm-generic/pgtable.h and add the definitions of the page table
accessors to the new header.

This patch (of 12):

The linux/mm.h header includes <asm/pgtable.h> to allow inlining of the
functions involving page table manipulations, e.g. pte_alloc() and
pmd_alloc().  So, there is no point to explicitly include <asm/pgtable.h>
in the files that include <linux/mm.h>.
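For the series as a whole, the consolidation target is a shared generic
form for each level, along the lines of the following sketch for the top
level (illustrative; the final include/linux/pgtable.h may differ in
detail):

	static inline unsigned long pgd_index(unsigned long address)
	{
		return (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
	}

	static inline pgd_t *pgd_offset(struct mm_struct *mm, unsigned long address)
	{
		return mm->pgd + pgd_index(address);
	}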
The include statements in such cases are removed with a simple loop:

	for f in $(git grep -l "include <asm/pgtable.h>") ; do
		sed -i -e '/include <asm\/pgtable.h>/ d' $f
	done

Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org
Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org
Signed-off-by: Mike Rapoport
Cc: Arnd Bergmann
Cc: Borislav Petkov
Cc: Brian Cain
Cc: Catalin Marinas
Cc: Chris Zankel
Cc: "David S. Miller"
Cc: Geert Uytterhoeven
Cc: Greentime Hu
Cc: Greg Ungerer
Cc: Guan Xuetao
Cc: Guo Ren
Cc: Heiko Carstens
Cc: Helge Deller
Cc: Ingo Molnar
Cc: Ley Foon Tan
Cc: Mark Salter
Cc: Matthew Wilcox
Cc: Matt Turner
Cc: Max Filippov
Cc: Michael Ellerman
Cc: Michal Simek
Cc: Mike Rapoport
Cc: Nick Hu
Cc: Paul Walmsley
Cc: Richard Weinberger
Cc: Rich Felker
Cc: Russell King
Cc: Stafford Horne
Cc: Thomas Bogendoerfer
Cc: Thomas Gleixner
Cc: Tony Luck
Cc: Vincent Chen
Cc: Vineet Gupta
Cc: Will Deacon
Cc: Yoshinori Sato
Signed-off-by: Andrew Morton
Signed-off-by: Stephen Rothwell
---
 arch/alpha/boot/bootp.c | 1 -
 arch/alpha/boot/bootpz.c | 1 -
 arch/alpha/boot/main.c | 1 -
 arch/alpha/include/asm/io.h | 1 -
 arch/alpha/kernel/process.c | 1 -
 arch/alpha/kernel/ptrace.c | 1 -
 arch/alpha/kernel/setup.c | 1 -
 arch/alpha/kernel/smp.c | 1 -
 arch/alpha/kernel/sys_alcor.c | 1 -
 arch/alpha/kernel/sys_cabriolet.c | 1 -
 arch/alpha/kernel/sys_dp264.c | 1 -
 arch/alpha/kernel/sys_eb64p.c | 1 -
 arch/alpha/kernel/sys_eiger.c | 1 -
 arch/alpha/kernel/sys_jensen.c | 1 -
 arch/alpha/kernel/sys_marvel.c | 1 -
 arch/alpha/kernel/sys_miata.c | 1 -
 arch/alpha/kernel/sys_mikasa.c | 1 -
 arch/alpha/kernel/sys_nautilus.c | 1 -
 arch/alpha/kernel/sys_noritake.c | 1 -
 arch/alpha/kernel/sys_rawhide.c | 1 -
 arch/alpha/kernel/sys_ruffian.c | 1 -
 arch/alpha/kernel/sys_rx164.c | 1 -
 arch/alpha/kernel/sys_sable.c | 1 -
 arch/alpha/kernel/sys_sio.c | 1 -
 arch/alpha/kernel/sys_sx164.c | 1 -
 arch/alpha/kernel/sys_takara.c | 1 -
 arch/alpha/kernel/sys_titan.c | 1 -
 arch/alpha/kernel/sys_wildfire.c | 1 -
 arch/alpha/mm/init.c | 1 -
 arch/arm/kernel/machine_kexec.c | 1 -
 arch/arm/kernel/module.c | 1 -
 arch/arm/kernel/ptrace.c | 1 -
 arch/arm/kernel/smp.c | 1 -
 arch/arm/mach-ebsa110/core.c | 1 -
 arch/arm/mach-footbridge/common.c | 1 -
 arch/arm/mach-imx/mm-imx21.c | 1 -
 arch/arm/mach-imx/mm-imx27.c | 1 -
 arch/arm/mach-imx/mm-imx3.c | 1 -
 arch/arm/mach-iop32x/i2c.c | 1 -
 arch/arm/mach-iop32x/iq31244.c | 1 -
 arch/arm/mach-iop32x/iq80321.c | 1 -
 arch/arm/mach-iop32x/n2100.c | 1 -
 arch/arm/mach-ixp4xx/common.c | 1 -
 arch/arm/mach-sa1100/assabet.c | 1 -
 arch/arm/mm/copypage-v4mc.c | 1 -
 arch/arm/mm/copypage-v6.c | 1 -
 arch/arm/mm/copypage-xscale.c | 1 -
 arch/arm/mm/dump.c | 1 -
 arch/arm/mm/fault-armv.c | 1 -
 arch/arm/mm/fault.c | 1 -
 arch/arm/mm/pageattr.c | 1 -
 arch/arm64/kernel/hibernate.c | 1 -
 arch/arm64/kernel/ptrace.c | 1 -
 arch/arm64/kernel/smp.c | 1 -
 arch/arm64/mm/dump.c | 1 -
 arch/arm64/mm/fault.c | 1 -
 arch/arm64/mm/kasan_init.c | 1 -
 arch/arm64/mm/pageattr.c | 1 -
 arch/csky/kernel/module.c | 1 -
 arch/csky/kernel/ptrace.c | 1 -
 arch/csky/mm/init.c | 1 -
 arch/csky/mm/tlb.c | 1 -
 arch/h8300/kernel/process.c | 1 -
 arch/h8300/kernel/setup.c | 1 -
 arch/h8300/kernel/signal.c | 1 -
 arch/h8300/mm/fault.c | 1 -
 arch/h8300/mm/init.c | 1 -
 arch/h8300/mm/memory.c | 1 -
 arch/hexagon/mm/vm_fault.c | 1 -
 arch/ia64/kernel/efi.c | 1 -
 arch/ia64/kernel/ptrace.c | 1 -
 arch/ia64/kernel/smp.c | 1 -
 arch/ia64/kernel/smpboot.c | 1 -
 arch/ia64/mm/contig.c | 1 -
 arch/ia64/mm/fault.c | 1 -
 arch/m68k/68000/timers.c | 1 -
 arch/m68k/amiga/config.c | 1 -
arch/m68k/apollo/config.c | 1 - arch/m68k/atari/atasound.c | 1 - arch/m68k/atari/stram.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68k/kernel/ptrace.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/signal.c | 1 - arch/m68k/kernel/uboot.c | 1 - arch/m68k/mac/config.c | 1 - arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/sun3kmap.c | 1 - arch/m68k/mm/sun3mmu.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - arch/m68k/q40/config.c | 1 - arch/m68k/sun3/config.c | 1 - arch/m68k/sun3/dvma.c | 1 - arch/m68k/sun3/mmu_emu.c | 1 - arch/m68k/sun3/sun3dvma.c | 1 - arch/m68k/sun3x/dvma.c | 1 - arch/m68k/sun3x/prom.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/mm/fault.c | 1 - arch/mips/fw/arc/memory.c | 1 - arch/mips/include/asm/mach-generic/floppy.h | 1 - arch/mips/include/asm/mach-jazz/floppy.h | 1 - arch/mips/jazz/jazzdma.c | 1 - arch/mips/kernel/module.c | 1 - arch/mips/kernel/process.c | 1 - arch/mips/kernel/ptrace.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/kernel/smp-bmips.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kvm/tlb.c | 1 - arch/mips/lib/dump_tlb.c | 1 - arch/mips/lib/r3k_dump_tlb.c | 1 - arch/mips/mm/c-octeon.c | 1 - arch/mips/mm/c-r3k.c | 1 - arch/mips/mm/c-r4k.c | 1 - arch/mips/mm/c-tx39.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/mm/page.c | 1 - arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - arch/mips/mm/sc-ip22.c | 1 - arch/mips/mm/sc-mips.c | 1 - arch/mips/mm/sc-r5k.c | 1 - arch/mips/mm/tlb-r3k.c | 1 - arch/mips/mm/tlb-r4k.c | 1 - arch/mips/sgi-ip27/ip27-init.c | 1 - arch/mips/sgi-ip27/ip27-timer.c | 1 - arch/mips/sgi-ip32/ip32-memory.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nds32/mm/proc.c | 1 - arch/nios2/kernel/module.c | 1 - arch/nios2/mm/init.c | 1 - arch/nios2/mm/pgtable.c | 1 - arch/nios2/mm/tlb.c | 1 - arch/openrisc/include/asm/tlbflush.h | 1 - arch/openrisc/kernel/asm-offsets.c | 1 - arch/openrisc/kernel/process.c | 1 - arch/openrisc/kernel/ptrace.c | 1 - arch/openrisc/kernel/setup.c | 1 - arch/openrisc/kernel/traps.c | 1 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/tlb.c | 1 - arch/parisc/include/asm/mmu_context.h | 1 - arch/parisc/kernel/module.c | 1 - arch/parisc/kernel/ptrace.c | 1 - arch/parisc/kernel/smp.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/include/asm/io.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/process.c | 1 - arch/powerpc/kernel/signal_32.c | 1 - arch/powerpc/kernel/signal_64.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/vdso.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/radix_pgtable.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/init_32.c | 1 - arch/powerpc/mm/init_64.c | 1 - arch/powerpc/mm/mem.c | 1 - arch/powerpc/mm/nohash/40x.c | 1 - arch/powerpc/mm/nohash/fsl_booke.c | 1 - arch/powerpc/mm/pgtable_32.c | 1 - arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/mm/ptdump/hashpagetable.c | 1 - arch/powerpc/mm/ptdump/ptdump.c | 1 - arch/powerpc/perf/callchain.c | 1 - arch/powerpc/perf/callchain_32.c | 1 - arch/powerpc/perf/callchain_64.c | 1 - arch/powerpc/platforms/8xx/cpm1.c | 1 - arch/powerpc/platforms/8xx/micropatch.c | 1 - arch/powerpc/platforms/cell/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/maple/setup.c | 1 - arch/powerpc/platforms/maple/time.c | 
1 - arch/powerpc/platforms/powermac/setup.c | 1 - arch/powerpc/platforms/powermac/time.c | 1 - arch/powerpc/platforms/pseries/setup.c | 1 - arch/powerpc/sysdev/cpm2.c | 1 - arch/powerpc/xmon/xmon.c | 1 - arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/include/asm/tlbflush.h | 1 - arch/s390/kernel/machine_kexec.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/kernel/vdso.c | 1 - arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/fault.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pageattr.c | 1 - arch/s390/mm/pgtable.c | 1 - arch/s390/mm/vmem.c | 1 - arch/sh/kernel/machine_kexec.c | 1 - arch/sh/kernel/ptrace_32.c | 1 - arch/sh/kernel/signal_32.c | 1 - arch/sh/mm/cache-sh3.c | 1 - arch/sh/mm/cache-sh4.c | 1 - arch/sh/mm/cache-sh7705.c | 1 - arch/sh/mm/nommu.c | 1 - arch/sparc/kernel/leon_smp.c | 1 - arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/setup_32.c | 1 - arch/sparc/kernel/setup_64.c | 1 - arch/sparc/kernel/signal32.c | 1 - arch/sparc/kernel/signal_32.c | 1 - arch/sparc/kernel/signal_64.c | 1 - arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 1 - arch/sparc/kernel/traps_64.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/sparc/mm/hugetlbpage.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - arch/sparc/mm/srmmu.c | 1 - arch/sparc/mm/tlb.c | 1 - arch/um/kernel/process.c | 1 - arch/um/kernel/skas/mmu.c | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/um/kernel/tlb.c | 1 - arch/um/kernel/trap.c | 1 - arch/um/kernel/um_arch.c | 1 - arch/unicore32/kernel/module.c | 1 - arch/unicore32/mm/fault.c | 1 - arch/x86/include/asm/iomap.h | 1 - arch/x86/include/asm/xen/page.h | 1 - arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - arch/x86/mm/pat/cpa-test.c | 1 - arch/x86/mm/pat/memtype.c | 1 - arch/x86/mm/pgtable.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/mm/pti.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/enlighten_pv.c | 1 - arch/x86/xen/grant-table.c | 1 - arch/xtensa/kernel/process.c | 1 - arch/xtensa/kernel/ptrace.c | 1 - arch/xtensa/kernel/setup.c | 1 - drivers/char/agp/frontend.c | 1 - drivers/char/agp/generic.c | 1 - drivers/char/bsr.c | 1 - drivers/char/mspec.c | 1 - drivers/gpu/drm/i915/i915_mm.c | 1 - drivers/infiniband/sw/rdmavt/mmap.c | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - drivers/media/platform/davinci/vpbe_display.c | 1 - drivers/media/v4l2-core/v4l2-common.c | 1 - drivers/misc/sgi-gru/grufault.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/uctrl.c | 1 - drivers/scsi/a2091.c | 1 - drivers/scsi/a3000.c | 1 - drivers/scsi/gvp11.c | 1 - drivers/scsi/lasi700.c | 1 - drivers/scsi/mvme147.c | 1 - drivers/scsi/sni_53c710.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/fbdev/acornfb.c | 1 - drivers/video/fbdev/atafb.c | 1 - drivers/video/fbdev/cirrusfb.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/fb-puv3.c 
| 1 - drivers/video/fbdev/hitfb.c | 1 - drivers/video/fbdev/neofb.c | 1 - drivers/video/fbdev/q40fb.c | 1 - drivers/video/fbdev/savage/savagefb_driver.c | 1 - drivers/xen/balloon.c | 1 - drivers/xen/grant-table.c | 1 - drivers/xen/privcmd.c | 1 - drivers/xen/xenbus/xenbus_probe.c | 1 - drivers/xen/xenbus/xenbus_probe_backend.c | 1 - drivers/xen/xenbus/xenbus_probe_frontend.c | 1 - fs/proc/array.c | 1 - fs/proc/meminfo.c | 1 - fs/proc/nommu.c | 1 - fs/proc/vmcore.c | 1 - include/linux/dax.h | 1 - init/init_task.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - lib/ioremap.c | 1 - mm/debug_vm_pgtable.c | 1 - mm/gup.c | 1 - mm/hugetlb.c | 1 - mm/memory.c | 1 - mm/page_io.c | 1 - mm/shmem.c | 1 - mm/sparse-vmemmap.c | 1 - mm/sparse.c | 1 - mm/swap_state.c | 1 - mm/swapfile.c | 1 - mm/vmacache.c | 1 - sound/core/sgbuf.c | 1 - virt/kvm/kvm_main.c | 1 - 320 files changed, 320 deletions(-) diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f485895..00266e6e1b7148 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344bff..43af71835adf8a 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed861097037..e5347a08000867 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5df..13bea465f1c063 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a86..b45f0b0d6511bf 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d661..8c43212ae38e6b 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b464..f5c42a8fcf9c83 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20a6..9f1354f6c6ab33 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65f9..e063b3857b3d56 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f8c..47459b73cdb7e2 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 
d335086218207d..9fb445d7dca52c 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb98735..3c43fd34752660 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141a5..bf99dcfd40c4e3 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77c9..0a2ab6cb18db49 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1da..83d6c53d6d4d14 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225d5..e1bee8f84c5877 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e170..7690dfd57cb6bc 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b0c..53adf43dcd44fc 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f76526..47f3ce4f719ada 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b54f..b5846ffdadce5b 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d3307401196029..4b1c8d85c4f083 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa49..94046f9aea08d4 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d80a..930005b2f630dd 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index 
a6bdc1da47adb7..7c420d8dac53d9 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c803..dd9de84b630c35 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c68640883e..9e2adb69bc74e8 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de14003..b1f3b4fcf99b2c 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fdef..2c54d707142ab7 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5b4..3c42b3147fd6f0 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e89b..974b6c64d3e6fb 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24ead..e15444b25ca050 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff6359a..d0f7c8896c9678 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a817..9a6432557871fe 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759f6..5960e3dfd2bfc0 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d02..eee095f0e2f6c2 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include