From c8eaf6ac76f40f6c59fc7d056e2e08c4a57ea9c7 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 28 Jan 2022 17:50:25 +0800 Subject: sched: move autogroup sysctls into its own file move autogroup sysctls to autogroup.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220128095025.8745-1-nizhen@uniontech.com --- kernel/sysctl.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..1cb7ca68cd4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1750,17 +1750,6 @@ static struct ctl_table kern_table[] = { .proc_handler = sysctl_sched_uclamp_handler, }, #endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -- cgit v1.2.3 From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 47 insertions(+), 6 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 79c52dd6c597..0a4267c63d3b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..730ab56d9e92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ -- cgit v1.2.3 From c574bbe917036c8968b984c82c7b13194fe5ce98 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 22 Mar 2022 14:46:23 -0700 Subject: NUMA balancing: optimize page placement for memory tiering system With the advent of various new memory types, some machines will have multiple types of memory, e.g. DRAM and PMEM (persistent memory). The memory subsystem of these machines can be called memory tiering system, because the performance of the different types of memory are usually different. In such system, because of the memory accessing pattern changing etc, some pages in the slow memory may become hot globally. So in this patch, the NUMA balancing mechanism is enhanced to optimize the page placement among the different memory types according to hot/cold dynamically. In a typical memory tiering system, there are CPUs, fast memory and slow memory in each physical NUMA node. The CPUs and the fast memory will be put in one logical node (called fast memory node), while the slow memory will be put in another (faked) logical node (called slow memory node). That is, the fast memory is regarded as local while the slow memory is regarded as remote. So it's possible for the recently accessed pages in the slow memory node to be promoted to the fast memory node via the existing NUMA balancing mechanism. The original NUMA balancing mechanism will stop to migrate pages if the free memory of the target node becomes below the high watermark. This is a reasonable policy if there's only one memory type. But this makes the original NUMA balancing mechanism almost do not work to optimize page placement among different memory types. Details are as follows. It's the common cases that the working-set size of the workload is larger than the size of the fast memory nodes. Otherwise, it's unnecessary to use the slow memory at all. So, there are almost always no enough free pages in the fast memory nodes, so that the globally hot pages in the slow memory node cannot be promoted to the fast memory node. To solve the issue, we have 2 choices as follows, a. Ignore the free pages watermark checking when promoting hot pages from the slow memory node to the fast memory node. This will create some memory pressure in the fast memory node, thus trigger the memory reclaiming. So that, the cold pages in the fast memory node will be demoted to the slow memory node. b. Define a new watermark called wmark_promo which is higher than wmark_high, and have kswapd reclaiming pages until free pages reach such watermark. The scenario is as follows: when we want to promote hot-pages from a slow memory to a fast memory, but fast memory's free pages would go lower than high watermark with such promotion, we wake up kswapd with wmark_promo watermark in order to demote cold pages and free us up some space. So, next time we want to promote hot-pages we might have a chance of doing so. The choice "a" may create high memory pressure in the fast memory node. If the memory pressure of the workload is high, the memory pressure may become so high that the memory allocation latency of the workload is influenced, e.g. the direct reclaiming may be triggered. The choice "b" works much better at this aspect. If the memory pressure of the workload is high, the hot pages promotion will stop earlier because its allocation watermark is higher than that of the normal memory allocation. So in this patch, choice "b" is implemented. A new zone watermark (WMARK_PROMO) is added. Which is larger than the high watermark and can be controlled via watermark_scale_factor. In addition to the original page placement optimization among sockets, the NUMA balancing mechanism is extended to be used to optimize page placement according to hot/cold among different memory types. So the sysctl user space interface (numa_balancing) is extended in a backward compatible way as follow, so that the users can enable/disable these functionality individually. The sysctl is converted from a Boolean value to a bits field. The definition of the flags is, - 0: NUMA_BALANCING_DISABLED - 1: NUMA_BALANCING_NORMAL - 2: NUMA_BALANCING_MEMORY_TIERING We have tested the patch with the pmbench memory accessing benchmark with the 80:20 read/write ratio and the Gauss access address distribution on a 2 socket Intel server with Optane DC Persistent Memory Model. The test results shows that the pmbench score can improve up to 95.9%. Thanks Andrew Morton to help fix the document format error. Link: https://lkml.kernel.org/r/20220221084529.1052339-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Johannes Weiner Reviewed-by: Oscar Salvador Reviewed-by: Yang Shi Cc: Michal Hocko Cc: Rik van Riel Cc: Mel Gorman Cc: Peter Zijlstra Cc: Dave Hansen Cc: Zi Yan Cc: Wei Xu Cc: Shakeel Butt Cc: zhongjiang-ali Cc: Randy Dunlap Cc: Feng Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 29 ++++++++++++++++++++--------- include/linux/mmzone.h | 1 + include/linux/sched/sysctl.h | 10 ++++++++++ kernel/sched/core.c | 21 +++++++++++++++++---- kernel/sysctl.c | 2 +- mm/migrate.c | 16 ++++++++++++++-- mm/page_alloc.c | 3 ++- mm/vmscan.c | 6 +++++- 8 files changed, 70 insertions(+), 18 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index d359bcfadd39..fdfd2b684822 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst). numa_balancing ============== -Enables/disables automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes -that access it often. +Enables/disables and configures automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes that access it often. +The value to set can be the result of ORing the following: -Enables/disables automatic NUMA memory balancing. On NUMA machines, there -is a performance penalty if remote memory is accessed by a CPU. When this -feature is enabled the kernel samples what task thread is accessing memory -by periodically unmapping pages and later trapping a page fault. At the -time of the page fault, it is determined if the data being accessed should -be migrated to a local memory node. += ================================= +0 NUMA_BALANCING_DISABLED +1 NUMA_BALANCING_NORMAL +2 NUMA_BALANCING_MEMORY_TIERING += ================================= + +Or NUMA_BALANCING_NORMAL to optimize page placement among different +NUMA nodes to reduce remote accessing. On NUMA machines, there is a +performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing +memory by periodically unmapping pages and later trapping a page +fault. At the time of the page fault, it is determined if the data +being accessed should be migrated to a local memory node. The unmapping of pages and trapping faults incur additional overhead that ideally is offset by improved memory locality but there is no universal @@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls. +Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among +different types of memory (represented as different NUMA nodes) to +place the hot pages in the fast memory. This is implemented based on +unmapping and page fault too. numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb =============================================================================================================================== diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 310b6e7ce58a..962b14d403e8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -353,6 +353,7 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_PROMO, NR_WMARK }; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c19dd5a2c05c..b5eec8854c5a 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -23,6 +23,16 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_END, }; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 + +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; +#else +#define sysctl_numa_balancing_mode 0 +#endif + /* * control realtime throttling: * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9745613d531c..da6a60383645 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4279,7 +4279,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4287,13 +4289,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4303,8 +4314,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 730ab56d9e92..3395b99d59a4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1696,7 +1696,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { diff --git a/mm/migrate.c b/mm/migrate.c index dc4adf979201..78b2cf87946d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -2031,16 +2032,27 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; int nr_pages = thp_nr_pages(page); + int order = compound_order(page); - VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, nr_pages)) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) { + int z; + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) + return 0; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + if (populated_zone(pgdat->node_zones + z)) + break; + } + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; + } if (isolate_lru_page(page)) return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a573aa9f5160..8b18a077c409 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8441,7 +8441,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; + zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/vmscan.c b/mm/vmscan.c index f5ec53f19f3b..499fa86e754a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include #include +#include #include "internal.h" @@ -3895,7 +3896,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) if (!managed_zone(zone)) continue; - mark = high_wmark_pages(zone); + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + mark = wmark_pages(zone, WMARK_PROMO); + else + mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } -- cgit v1.2.3 From a60707d74bd1d119cf7bcc5101cda912fc46d5e3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:57 +0800 Subject: sched: Move child_runs_first sysctls to fair.c move child_runs_first sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- kernel/sched/fair.c | 19 +++++++++++++++++++ kernel/sched/sched.h | 2 ++ kernel/sysctl.c | 7 ------- 4 files changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c1076b5e17fb..1d2ff3cd1728 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,8 +14,6 @@ extern unsigned long sysctl_hung_task_timeout_secs; enum { sysctl_hung_task_timeout_secs = 0 }; #endif -extern unsigned int sysctl_sched_child_runs_first; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..788b1d6a3248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,6 +109,25 @@ static unsigned int sched_nr_latency = 8; * parent will (try to) run first. */ unsigned int sysctl_sched_child_runs_first __read_mostly; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_child_runs_first_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int __init sched_child_runs_first_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_child_runs_first_sysctls); + return 0; +} +late_initcall(sched_child_runs_first_sysctl_init); +#endif /* * SCHED_OTHER wake-up granularity. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58263f90c559..767fc1de9646 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -108,6 +108,8 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern unsigned int sysctl_sched_child_runs_first; + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..6bbb8e1af675 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,13 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3 From f5ef06d58be8311a9425e6a54a053ecb350952f3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:58 +0800 Subject: sched: Move schedstats sysctls to core.c move schedstats sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- kernel/sched/core.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 11 ----------- 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1d2ff3cd1728..6c7a6850559b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -64,8 +64,6 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..04440da5955f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4430,7 +4430,7 @@ out: __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4449,6 +4449,26 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, set_schedstats(state); return err; } + +static struct ctl_table sched_schedstats_sysctls[] = { + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_schedstats_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_schedstats_sysctls); + return 0; +} +late_initcall(sched_schedstats_sysctl_init); #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6bbb8e1af675..fc0eeca20718 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,17 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_TASK_DELAY_ACCT { .procname = "task_delayacct", -- cgit v1.2.3 From d9ab0e63fa7f8405fbb19e28c5191e0880a7f2db Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:59 +0800 Subject: sched: Move rt_period/runtime sysctls to rt.c move rt_period/runtime sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 11 ----------- kernel/rcu/rcu.h | 2 ++ kernel/sched/core.c | 13 ------------- kernel/sched/rt.c | 43 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 4 ++++ kernel/sysctl.c | 14 -------------- 6 files changed, 48 insertions(+), 39 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 6c7a6850559b..4391c1307945 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,15 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -/* - * control realtime throttling: - * - * /proc/sys/kernel/sched_rt_period_us - * /proc/sys/kernel/sched_rt_runtime_us - */ -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - extern unsigned int sysctl_sched_dl_period_max; extern unsigned int sysctl_sched_dl_period_min; @@ -58,8 +49,6 @@ extern int sched_rr_timeslice; int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..7812c740b3bf 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,8 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +extern int sysctl_sched_rt_runtime; + /* * Return the counter portion of a sequence number previously returned * by rcu_seq_snap() or rcu_seq_current(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 04440da5955f..774f3229db37 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -145,12 +145,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8; const_debug unsigned int sysctl_sched_nr_migrate = 32; #endif -/* - * period over which we measure -rt task CPU usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - __read_mostly int scheduler_running; #ifdef CONFIG_SCHED_CORE @@ -444,13 +438,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ -/* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - /* * Serialization rules: * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a32c46889af8..5663bb5ff890 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -13,6 +13,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + {} +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -2925,7 +2966,7 @@ static void sched_rt_do_global(void) raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 767fc1de9646..3b406c78a8e9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -114,6 +114,10 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fc0eeca20718..029bfe06c68d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, -- cgit v1.2.3 From 84227c12888b1105725cd2de14705b029bcbb4b2 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:00 +0800 Subject: sched: Move deadline_period sysctls to deadline.c move deadline_period sysctls to deadline.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 3 --- kernel/sched/deadline.c | 42 ++++++++++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 -------------- 3 files changed, 34 insertions(+), 25 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4391c1307945..7da9b94c5e1c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,9 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -extern unsigned int sysctl_sched_dl_period_max; -extern unsigned int sysctl_sched_dl_period_min; - #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb4255ae0b2c..82e10b74a7b2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,40 @@ * Fabio Checconi */ +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -2879,14 +2913,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_flags |= dl_se->flags; } -/* - * Default limits for DL period; on the top end we guard against small util - * tasks still getting ridiculously long effective runtimes, on the bottom end we - * guard against timer DoS. - */ -unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ -unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ - /* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 029bfe06c68d..2c8c75e11a37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_deadline_period_max_us", - .data = &sysctl_sched_dl_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_deadline_period_min_us", - .data = &sysctl_sched_dl_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, -- cgit v1.2.3 From dafd7a9dad22fadcb290b24dff54e2eae3b89776 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:01 +0800 Subject: sched: Move rr_timeslice sysctls to rt.c move rr_timeslice sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 5 ----- kernel/sched/rt.c | 13 +++++++++++-- kernel/sched/sched.h | 1 + kernel/sysctl.c | 7 ------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7da9b94c5e1c..3d307e512d1f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,11 +41,6 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -extern int sysctl_sched_rr_timeslice; -extern int sched_rr_timeslice; - -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5663bb5ff890..71791be36065 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,7 +5,7 @@ */ int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -27,6 +27,8 @@ int sysctl_sched_rt_runtime = 950000; static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #ifdef CONFIG_SYSCTL static struct ctl_table sched_rt_sysctls[] = { { @@ -43,6 +45,13 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, {} }; @@ -3005,7 +3014,7 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3b406c78a8e9..ae0f6e5a76f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -117,6 +117,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; /* * Helpers for converting nanosecond timing to jiffy resolution diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2c8c75e11a37..b074f70a3e11 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,13 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, #ifdef CONFIG_UCLAMP_TASK { .procname = "sched_util_clamp_min", -- cgit v1.2.3 From 3267e0156c3341ac25b37a0f60551cdae1634b60 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:02 +0800 Subject: sched: Move uclamp_util sysctls to core.c move uclamp_util sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 8 -------- kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++---------- kernel/sysctl.c | 23 --------------------- 3 files changed, 37 insertions(+), 42 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3d307e512d1f..0934b21a57a4 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,18 +31,10 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_UCLAMP_TASK -extern unsigned int sysctl_sched_uclamp_util_min; -extern unsigned int sysctl_sched_uclamp_util_max; -extern unsigned int sysctl_sched_uclamp_util_min_rt_default; -#endif - #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 774f3229db37..ef31751c5799 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1306,10 +1306,10 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* * By default RT tasks run at the maximum performance point/capacity of the @@ -1326,7 +1326,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. */ -unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1779,7 +1779,7 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; @@ -4436,8 +4436,12 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, set_schedstats(state); return err; } +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_SCHEDSTATS */ -static struct ctl_table sched_schedstats_sysctls[] = { +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", .data = NULL, @@ -4447,17 +4451,39 @@ static struct ctl_table sched_schedstats_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ {} }; - -static int __init sched_schedstats_sysctl_init(void) +static int __init sched_core_sysctl_init(void) { - register_sysctl_init("kernel", sched_schedstats_sysctls); + register_sysctl_init("kernel", sched_core_sysctls); return 0; } -late_initcall(sched_schedstats_sysctl_init); -#endif /* CONFIG_PROC_SYSCTL */ -#endif /* CONFIG_SCHEDSTATS */ +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ /* * fork()/clone()-time setup: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b074f70a3e11..a48c090d57f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,29 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_min_rt_default", - .data = &sysctl_sched_uclamp_util_min_rt_default, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -- cgit v1.2.3 From d4ae80ffa64f87b9c355692b680b603add084e96 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:03 +0800 Subject: sched: Move cfs_bandwidth_slice sysctls to fair.c move cfs_bandwidth_slice sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 4 ---- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++----------------- kernel/sysctl.c | 10 --------- 3 files changed, 31 insertions(+), 34 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0934b21a57a4..198f77c8a873 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,10 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 788b1d6a3248..265bf7a75a37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,25 +109,6 @@ static unsigned int sched_nr_latency = 8; * parent will (try to) run first. */ unsigned int sysctl_sched_child_runs_first __read_mostly; -#ifdef CONFIG_SYSCTL -static struct ctl_table sched_child_runs_first_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - {} -}; - -static int __init sched_child_runs_first_sysctl_init(void) -{ - register_sysctl_init("kernel", sched_child_runs_first_sysctls); - return 0; -} -late_initcall(sched_child_runs_first_sysctl_init); -#endif /* * SCHED_OTHER wake-up granularity. @@ -192,7 +173,37 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a48c090d57f9..10ab81c7c457 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,16 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) { .procname = "sched_energy_aware", -- cgit v1.2.3 From 8a0441415b3f9b9a920a6a5086580ea3daa7b884 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:04 +0800 Subject: sched: Move energy_aware sysctls to topology.c move energy_aware sysctls to topology.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 6 ------ kernel/sched/topology.c | 25 +++++++++++++++++++++++-- kernel/sysctl.c | 11 ----------- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 198f77c8a873..e650946816d0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -34,10 +34,4 @@ extern int sysctl_numa_balancing_mode; int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -extern unsigned int sysctl_sched_energy_aware; -int sched_energy_aware_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 810750e62118..05b6c2ad90b9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -206,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; +static unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -220,7 +220,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -237,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10ab81c7c457..8241c5401ee8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1681,17 +1681,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- cgit v1.2.3 From 06d177662fb86b80c7fc2290667b9a14cb0bd925 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Thu, 17 Feb 2022 12:23:21 +0800 Subject: kernel/reboot: move reboot sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the poweroff_cmd and ctrl-alt-del sysctls to its own file, kernel/reboot.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/reboot.h | 4 ---- kernel/reboot.c | 34 ++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 14 -------------- 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index af907a3d68d1..a2429648d831 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -71,12 +71,8 @@ extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); -extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); -#define POWEROFF_CMD_PATH_LEN 256 -extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; - extern void orderly_poweroff(bool force); extern void orderly_reboot(void); void hw_protection_shutdown(const char *reason, int ms_until_forced); diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..ed4e6dfb7d44 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -23,7 +23,7 @@ * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ -int C_A_D = 1; +static int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); @@ -417,9 +417,37 @@ void ctrl_alt_del(void) kill_cad_pid(SIGINT, 1); } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +#define POWEROFF_CMD_PATH_LEN 256 +static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; static const char reboot_cmd[] = "/sbin/reboot"; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_reboot_table[] = { + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static void __init kernel_reboot_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_reboot_table); +} +#else +#define kernel_reboot_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static int run_cmd(const char *cmd) { char **argv; @@ -886,6 +914,8 @@ static int __init reboot_ksysfs_init(void) return ret; } + kernel_reboot_sysctls_init(); + return 0; } late_initcall(reboot_ksysfs_init); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8241c5401ee8..5e43569ce2be 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1798,13 +1798,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_FUNCTION_TRACER { .procname = "ftrace_enabled", @@ -2111,13 +2104,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, #ifdef CONFIG_KEYS { .procname = "keys", -- cgit v1.2.3 From 43fe219aa56a2fdd8f0623c9470a32b14b0617a5 Mon Sep 17 00:00:00 2001 From: sujiaxun Date: Thu, 17 Feb 2022 18:51:48 -0800 Subject: mm: move oom_kill sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the oom_kill sysctls to their own file, mm/oom_kill.c [sfr@canb.auug.org.au: null-terminate the array] Link: https://lkml.kernel.org/r/20220216193202.28838626@canb.auug.org.au Link: https://lkml.kernel.org/r/20220215093203.31032-1-sujiaxun@uniontech.com Signed-off-by: sujiaxun Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/oom.h | 4 ---- kernel/sysctl.c | 23 ----------------------- mm/oom_kill.c | 38 +++++++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 30 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a1432511..02d1e7bbd8cd 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -123,8 +123,4 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -/* sysctls */ -extern int sysctl_oom_dump_tasks; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_panic_on_oom; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5e43569ce2be..a21c0ea396f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2241,29 +2241,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..7cc338a9e9e4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -52,9 +52,38 @@ #define CREATE_TRACE_POINTS #include -int sysctl_panic_on_oom; -int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks = 1; +static int sysctl_panic_on_oom; +static int sysctl_oom_kill_allocating_task; +static int sysctl_oom_dump_tasks = 1; + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_oom_kill_table[] = { + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* * Serializes oom killer invocations (out_of_memory()) from all contexts to @@ -677,6 +706,9 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_oom_kill_table); +#endif return 0; } subsys_initcall(oom_init) -- cgit v1.2.3 From aa779e5102195e1d9ade95dcbc0bfbd8f916eb59 Mon Sep 17 00:00:00 2001 From: zhanglianjie Date: Thu, 17 Feb 2022 18:51:51 -0800 Subject: mm: move page-writeback sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the page-writeback sysctls to its own file. [akpm@linux-foundation.org: coding-style cleanups] akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warnings] Link: https://lkml.kernel.org/r/20220129012955.26594-1-zhanglianjie@uniontech.com Signed-off-by: zhanglianjie Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/writeback.h | 15 ------- kernel/sysctl.c | 69 ------------------------------- mm/page-writeback.c | 103 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 94 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec..dc2b94e6a94f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom); extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ -extern int dirty_background_ratio; -extern unsigned long dirty_background_bytes; -extern int vm_dirty_ratio; -extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; -extern int vm_highmem_is_dirtyable; extern int laptop_mode; -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a21c0ea396f3..36bfe1c92d44 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,8 +100,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2263,55 +2261,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, @@ -2483,13 +2432,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, @@ -2587,17 +2529,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e2da284e427..438762173a59 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,30 +70,33 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -491,7 +494,8 @@ bool node_dirty_ok(struct pglist_data *pgdat) return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -502,7 +506,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, +static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -513,7 +517,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, +static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; @@ -527,7 +531,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, +static int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; @@ -540,6 +544,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -1981,10 +1986,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } +#ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, +static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; @@ -2005,6 +2011,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } +#endif void laptop_mode_timer_fn(struct timer_list *t) { @@ -2069,6 +2076,79 @@ static int page_writeback_cpu_online(unsigned int cpu) return 0; } +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif + /* * Called early on to tune the page writeback dirty limits. * @@ -2093,6 +2173,9 @@ void __init page_writeback_init(void) page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** -- cgit v1.2.3 From f79c9b8ae8bde10126586c1bb55b5fd027276d8e Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:58:57 +0800 Subject: kernel/lockdep: move lockdep sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the prove_locking and lock_stat sysctls to its own file, kernel/lockdep.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/lockdep.h | 4 ---- kernel/locking/lockdep.c | 35 +++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 21 --------------------- 3 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..37951c17908e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -16,10 +16,6 @@ struct task_struct; -/* for sysctl */ -extern int prove_locking; -extern int lock_stat; - #ifdef CONFIG_LOCKDEP #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c06cab6546ed..a4382ae1be59 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -64,19 +64,50 @@ #include #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ + { } +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 36bfe1c92d44..95380d250c8c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -88,9 +88,6 @@ #ifdef CONFIG_RT_MUTEXES #include #endif -#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) -#include -#endif #if defined(CONFIG_SYSCTL) @@ -1679,24 +1676,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic", .data = &panic_timeout, -- cgit v1.2.3 From 9df918698408fd914493aba0b7858fef50eba63a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:12 +0800 Subject: kernel/panic: move panic sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the oops_all_cpu_backtrace sysctl to its own file, kernel/panic.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/panic.h | 6 ------ kernel/panic.c | 26 +++++++++++++++++++++++++- kernel/sysctl.c | 11 ----------- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/panic.h b/include/linux/panic.h index f5844908a089..e71161da69c4 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -15,12 +15,6 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); -#ifdef CONFIG_SMP -extern unsigned int sysctl_oops_all_cpu_backtrace; -#else -#define sysctl_oops_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; diff --git a/kernel/panic.c b/kernel/panic.c index eb4dfb932c85..eb3f2fe4f6d7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -43,7 +43,9 @@ * Should we dump all CPUs backtraces in an oops event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; @@ -73,6 +75,28 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +static struct ctl_table kern_panic_table[] = { + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_panic_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_panic_table); + return 0; +} +late_initcall(kernel_panic_sysctls_init); +#endif + static long no_blink(int state) { return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 95380d250c8c..90fc2212b536 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1922,17 +1922,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SMP - { - .procname = "oops_all_cpu_backtrace", - .data = &sysctl_oops_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, -- cgit v1.2.3 From 801b501439d1b366d524dee4fc1e6b3473a95b9a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:23 +0800 Subject: kernel/acct: move acct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the acct sysctl to its own file, kernel/acct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/acct.h | 1 - kernel/acct.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 12 ------------ 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/acct.h b/include/linux/acct.h index bc70e81895c0..2718c4854815 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -21,7 +21,6 @@ #ifdef CONFIG_BSD_PROCESS_ACCT struct pid_namespace; -extern int acct_parm[]; /* for sysctl */ extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); diff --git a/kernel/acct.c b/kernel/acct.c index 3df53cf1dcd5..13706356ec54 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -70,11 +70,31 @@ * Turned into sysctl-controllable parameters. AV, 12/11/98 */ -int acct_parm[3] = {4, 2, 30}; +static int acct_parm[3] = {4, 2, 30}; #define RESUME (acct_parm[0]) /* >foo% free space - resume */ #define SUSPEND (acct_parm[1]) /* #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -#include -#endif #ifdef CONFIG_RT_MUTEXES #include #endif @@ -1856,15 +1853,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .procname = "sysrq", -- cgit v1.2.3 From 1186618a6a35d43a865448472a261184b608d13c Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:36 +0800 Subject: kernel/delayacct: move delayacct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the delayacct sysctl to its own file, kernel/delayacct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/delayacct.h | 3 --- kernel/delayacct.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 12 ------------ 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 3e03d010bd2e..6b16a6930a19 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -61,9 +61,6 @@ extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); -extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index c5e8cea9e05f..2c1e18f7c5cf 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,7 +44,7 @@ void delayacct_init(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, +static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int state = delayacct_on; @@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, set_delayacct(state); return err; } + +static struct ctl_table kern_delayacct_table[] = { + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_delayacct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_delayacct_table); + return 0; +} +late_initcall(kernel_delayacct_sysctls_init); #endif void __delayacct_tsk_init(struct task_struct *tsk) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5421e28dbb25..9b74ba12a711 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,7 +67,6 @@ #include #include #include -#include #include "../lib/kstrtox.h" @@ -1651,17 +1650,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { -#ifdef CONFIG_TASK_DELAY_ACCT - { - .procname = "task_delayacct", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_delayacct, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", -- cgit v1.2.3 From d772cc2c321900f3f463a124eebeb7218e66dda6 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:49 +0800 Subject: kernel/do_mount_initrd: move real_root_dev sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the real_root_dev sysctl to its own file, kernel/do_mount_initrd.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/initrd.h | 2 -- init/do_mounts_initrd.c | 22 +++++++++++++++++++++- kernel/sysctl.c | 9 --------- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 1bbe9af48dc3..f1a1f4c92ded 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -29,8 +29,6 @@ static inline void wait_for_initramfs(void) {} extern phys_addr_t phys_initrd_start; extern unsigned long phys_initrd_size; -extern unsigned int real_root_dev; - extern char __initramfs_start[]; extern unsigned long __initramfs_size; diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 533d81ed74d4..327962ea354c 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -14,12 +14,32 @@ unsigned long initrd_start, initrd_end; int initrd_below_start_ok; -unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ static int __initdata mount_initrd = 1; phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_do_mounts_initrd_table[] = { + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static __init int kernel_do_mounts_initrd_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_do_mounts_initrd_table); + return 0; +} +late_initcall(kernel_do_mounts_initrd_sysctls_init); +#endif /* CONFIG_SYSCTL */ + static int __init no_initrd(char *str) { mount_initrd = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b74ba12a711..10a551f8fcab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1693,15 +1693,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_latencytop, }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "print-fatal-signals", -- cgit v1.2.3 From 8e4e83b2278bdfb55cb2b13de07cf0a721ce8af7 Mon Sep 17 00:00:00 2001 From: Wei Xiao Date: Wed, 23 Feb 2022 19:11:53 +0800 Subject: ftrace: move sysctl_ftrace_enabled to ftrace.c This moves ftrace_enabled to trace/ftrace.c. We move sysctls to places where features actually belong to improve the readability of the code and reduce the risk of code merge conflicts. At the same time, the proc-sysctl maintainers do not want to know what sysctl knobs you wish to add for your owner piece of code, we just care about the core logic. Signed-off-by: Wei Xiao Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- include/linux/ftrace.h | 3 --- kernel/sysctl.c | 9 --------- kernel/trace/ftrace.c | 22 +++++++++++++++++++++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..088b915853dd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -101,9 +101,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; -extern int -ftrace_enable_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 10a551f8fcab..21172d3dad6e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1751,15 +1751,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif #ifdef CONFIG_STACK_TRACER { .procname = "stack_tracer_enabled", diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..a5efbbc289b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7921,7 +7921,8 @@ static bool is_permanent_ops_registered(void) return false; } -int +#ifdef CONFIG_SYSCTL +static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -7964,3 +7965,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif -- cgit v1.2.3 From efaa0227f6c6a5073951b20cf2f8c63c4155306c Mon Sep 17 00:00:00 2001 From: tangmeng Date: Tue, 15 Feb 2022 14:50:19 +0800 Subject: timers: Move timer sysctl into the timer code This is part of the effort to reduce kernel/sysctl.c to only contain the core logic. Signed-off-by: tangmeng Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220215065019.7520-1-tangmeng@uniontech.com --- include/linux/timer.h | 8 -------- kernel/sysctl.c | 11 ----------- kernel/time/timer.c | 53 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 38 insertions(+), 34 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/timer.h b/include/linux/timer.h index fda13c9d1256..648f00105f58 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -196,14 +196,6 @@ extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -struct ctl_table; - -extern unsigned int sysctl_timer_migration; -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..5b7b1a82ae6a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2288,17 +2288,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_BPF_SYSCALL { .procname = "unprivileged_bpf_disabled", diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 60aebf2b7f0a..ef082d43c307 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP -unsigned int sysctl_timer_migration = 1; +static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); @@ -234,7 +235,42 @@ static void timers_update_migration(void) else static_branch_disable(&timers_migration_enabled); } -#else + +#ifdef CONFIG_SYSCTL +static int timer_migration_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static struct ctl_table timer_sysctl[] = { + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init timer_sysctl_init(void) +{ + register_sysctl("kernel", timer_sysctl); + return 0; +} +device_initcall(timer_sysctl_init); +#endif /* CONFIG_SYSCTL */ +#else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ @@ -251,19 +287,6 @@ void timers_update_nohz(void) schedule_work(&timer_update_work); } -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - mutex_lock(&timer_keys_mutex); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!ret && write) - timers_update_migration(); - mutex_unlock(&timer_keys_mutex); - return ret; -} - static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); -- cgit v1.2.3 From 2900005ea287b11dcc8c1b9fcf24893b7ff41d6d Mon Sep 17 00:00:00 2001 From: Yan Zhu Date: Thu, 7 Apr 2022 15:07:59 +0800 Subject: bpf: Move BPF sysctls from kernel/sysctl.c to BPF core We're moving sysctls out of kernel/sysctl.c as it is a mess. We already moved all filesystem sysctls out. And with time the goal is to move all sysctls out to their own subsystem/actual user. kernel/sysctl.c has grown to an insane mess and its easy to run into conflicts with it. The effort to move them out into various subsystems is part of this. Signed-off-by: Yan Zhu Signed-off-by: Daniel Borkmann Cc: Luis Chamberlain Link: https://lore.kernel.org/bpf/20220407070759.29506-1-zhuyan34@huawei.com --- kernel/bpf/syscall.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 79 ----------------------------------------------- 2 files changed, 87 insertions(+), 79 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cdaa1152436a..e9621cfa09f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4908,3 +4908,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = { const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; + +#ifdef CONFIG_SYSCTL +static int bpf_stats_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} + +void __weak unpriv_ebpf_notify(int new_state) +{ +} + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + + unpriv_ebpf_notify(unpriv_enable); + + return ret; +} + +static struct ctl_table bpf_syscall_table[] = { + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = bpf_stats_handler, + }, + { } +}; + +static int __init bpf_syscall_sysctl_init(void) +{ + register_sysctl_init("kernel", bpf_syscall_table); + return 0; +} +late_initcall(bpf_syscall_sysctl_init); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..47139877f62d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,7 +62,6 @@ #include #include #include -#include #include #include #include @@ -148,66 +147,6 @@ static const int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int bpf_stats_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static int saved_val; - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&bpf_stats_enabled_mutex); - val = saved_val; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret && val != saved_val) { - if (val) - static_key_slow_inc(key); - else - static_key_slow_dec(key); - saved_val = val; - } - mutex_unlock(&bpf_stats_enabled_mutex); - return ret; -} - -void __weak unpriv_ebpf_notify(int new_state) -{ -} - -static int bpf_unpriv_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret, unpriv_enable = *(int *)table->data; - bool locked_state = unpriv_enable == 1; - struct ctl_table tmp = *table; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - tmp.data = &unpriv_enable; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (locked_state && unpriv_enable != 1) - return -EPERM; - *(int *)table->data = unpriv_enable; - } - - unpriv_ebpf_notify(unpriv_enable); - - return ret; -} -#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ - /* * /proc/sys support */ @@ -2299,24 +2238,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - .proc_handler = bpf_unpriv_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = bpf_stats_handler, - }, -#endif #if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", -- cgit v1.2.3 From 988f11e046401f8561c4afefa506a50f0203de40 Mon Sep 17 00:00:00 2001 From: liaohua Date: Thu, 7 Apr 2022 15:29:48 +0800 Subject: latencytop: move sysctl to its own file This moves latencytop sysctl to kernel/latencytop.c Signed-off-by: liaohua Signed-off-by: Luis Chamberlain --- include/linux/latencytop.h | 3 --- kernel/latencytop.c | 41 +++++++++++++++++++++++++++++------------ kernel/sysctl.c | 10 ---------- 3 files changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index abe3d95f795b..84f1053cf2a8 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,9 +38,6 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - #else static inline void diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 166d7bf49666..76166df011a4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,7 @@ #include #include #include +#include static DEFINE_RAW_SPINLOCK(latency_lock); @@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; +#ifdef CONFIG_SYSCTL +static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} + +static struct ctl_table latencytop_sysctl[] = { + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, + {} +}; +#endif + void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; @@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = { static int __init init_lstats_procfs(void) { proc_create("latency_stats", 0644, NULL, &lstats_proc_ops); +#ifdef CONFIG_SYSCTL + register_sysctl_init("kernel", latencytop_sysctl); +#endif return 0; } - -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - int err; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - if (latencytop_enabled) - force_schedstat_enabled(); - - return err; -} device_initcall(init_lstats_procfs); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0fdf465a93d..b60345cbadf0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include "../lib/kstrtox.h" @@ -1623,15 +1622,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, #endif { .procname = "print-fatal-signals", -- cgit v1.2.3 From a467257ffe4bdb13eacddec0137013f6a1140b81 Mon Sep 17 00:00:00 2001 From: yingelin Date: Sun, 24 Apr 2022 10:57:40 +0800 Subject: kernel/kexec_core: move kexec_core sysctls into its own file This move the kernel/kexec_core.c respective sysctls to its own file. kernel/sysctl.c has grown to an insane mess, We move sysctls to places where features actually belong to improve the readability and reduce merge conflicts. At the same time, the proc-sysctl maintainers can easily care about the core logic other than the sysctl knobs added for some feature. We already moved all filesystem sysctls out. This patch is part of the effort to move kexec related sysctls out. Signed-off-by: yingelin Acked-by: Baoquan He Signed-off-by: Luis Chamberlain --- kernel/kexec_core.c | 22 ++++++++++++++++++++++ kernel/sysctl.c | 13 ------------- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..a0456baf52cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -936,6 +936,28 @@ int kimage_load_segment(struct kimage *image, struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; +#ifdef CONFIG_SYSCTL +static struct ctl_table kexec_core_sysctls[] = { + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init kexec_core_sysctl_init(void) +{ + register_sysctl_init("kernel", kexec_core_sysctls); + return 0; +} +late_initcall(kexec_core_sysctl_init); +#endif /* * No panic_cpu check version of crash_kexec(). This function is called diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b60345cbadf0..0f3cb61a2e39 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -1712,18 +1711,6 @@ static struct ctl_table kern_table[] = { .proc_handler = tracepoint_printk_sysctl, }, #endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", -- cgit v1.2.3 From c381d02b2fd5f82d2207db1b9b25ff60d0d9c27c Mon Sep 17 00:00:00 2001 From: Yuwei Wang Date: Wed, 29 Jun 2022 08:48:31 +0000 Subject: sysctl: add proc_dointvec_ms_jiffies_minmax add proc_dointvec_ms_jiffies_minmax to fit read msecs value to jiffies with a limited range of values Signed-off-by: Yuwei Wang Signed-off-by: Paolo Abeni --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80263f7cdb77..17b42ce89d3e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -75,6 +75,8 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..85c92e2c2570 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1237,6 +1237,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + return 0; +} + /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table @@ -1259,6 +1283,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, do_proc_dointvec_jiffies_conv,NULL); } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); +} + /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table @@ -1523,6 +1558,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 1f1be04b4d48a2475ea1aab46a99221bfc5c0968 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:52 -0700 Subject: sysctl: Fix data races in proc_dointvec(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..c8a05655ae60 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -446,14 +446,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; - *valp = -*lvalp; + WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } } else { - int val = *valp; + int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; -- cgit v1.2.3 From 4762b532ec9539755aab61445d5da6e1926ccb99 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:53 -0700 Subject: sysctl: Fix data races in proc_douintvec(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_douintvec() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_douintvec() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8a05655ae60..2ab8c2a37e8f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -472,9 +472,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, if (write) { if (*lvalp > UINT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } else { - unsigned int val = *valp; + unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; -- cgit v1.2.3 From f613d86d014b6375a4085901de39406598121e35 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:54 -0700 Subject: sysctl: Fix data races in proc_dointvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ab8c2a37e8f..4d87832367f2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -857,7 +857,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; -- cgit v1.2.3 From 2d3b559df3ed39258737789aae2ae7973d205bc1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:55 -0700 Subject: sysctl: Fix data races in proc_douintvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_douintvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_douintvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 61d9b56a8920 ("sysctl: add unsigned int range support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d87832367f2..379721a03d41 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -923,7 +923,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, (param->max && *param->max < tmp)) return -ERANGE; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; -- cgit v1.2.3 From c31bcc8fb89fc2812663900589c6325ba35d9a65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:56 -0700 Subject: sysctl: Fix data races in proc_doulongvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_doulongvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_doulongvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 379721a03d41..8c55ba01f41b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1090,9 +1090,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = -EINVAL; break; } - *i = val; + WRITE_ONCE(*i, val); } else { - val = convdiv * (*i) / convmul; + val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); -- cgit v1.2.3 From e877820877663fbae8cb9582ea597a7230b94df3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:57 -0700 Subject: sysctl: Fix data races in proc_dointvec_jiffies(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_jiffies() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_jiffies() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c55ba01f41b..bf9383d17e1b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1173,9 +1173,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, if (write) { if (*lvalp > INT_MAX / HZ) return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + if (*negp) + WRITE_ONCE(*valp, -*lvalp * HZ); + else + WRITE_ONCE(*valp, *lvalp * HZ); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; -- cgit v1.2.3 From 7dee5d7747a69aa2be41f04c6a7ecfe3ac8cdf18 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Jul 2022 17:15:19 -0700 Subject: sysctl: Fix data-races in proc_dou8vec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dou8vec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dou8vec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: cb9444130662 ("sysctl: add proc_dou8vec_minmax()") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bf9383d17e1b..b016d68da08a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1007,13 +1007,13 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, tmp.maxlen = sizeof(val); tmp.data = &val; - val = *data; + val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, ¶m); if (res) return res; if (write) - *data = val; + WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -- cgit v1.2.3 From 7d1025e559782b58824b36cb8ad547a69f2e4b31 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Jul 2022 17:15:20 -0700 Subject: sysctl: Fix data-races in proc_dointvec_ms_jiffies(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_ms_jiffies() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_ms_jiffies() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b016d68da08a..d99bc3945445 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1224,9 +1224,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, if (jif > INT_MAX) return 1; - *valp = (int)jif; + WRITE_ONCE(*valp, (int)jif); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1294,8 +1294,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. -- cgit v1.2.3 From 43b5240ca6b33108998810593248186b1e3ae34a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 9 Jun 2022 18:40:32 +0800 Subject: mm: sysctl: fix missing numa_stat when !CONFIG_HUGETLB_PAGE "numa_stat" should not be included in the scope of CONFIG_HUGETLB_PAGE, if CONFIG_HUGETLB_PAGE is not configured even if CONFIG_NUMA is configured, "numa_stat" is missed form /proc. Move it out of CONFIG_HUGETLB_PAGE to fix it. Fixes: 4518085e127d ("mm, sysctl: make NUMA stats configurable") Signed-off-by: Muchun Song Cc: Acked-by: Michal Hocko Acked-by: Mel Gorman Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..aaf0b1f1dc57 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2091,6 +2091,17 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", @@ -2107,15 +2118,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { .procname = "hugetlb_shm_group", -- cgit v1.2.3 From 7251ceb51af972603552fcea2db316ed2b9d95ba Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 17 May 2022 17:07:31 +0200 Subject: sysctl: Merge adjacent CONFIG_TREE_RCU blocks There are two adjacent sysctl entries protected by the same CONFIG_TREE_RCU config symbol. Merge them into a single block to improve readability. Use the more common "#ifdef" form while at it. Signed-off-by: Geert Uytterhoeven Reviewed-by: Paul E. McKenney Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b233714a1c78..8a0e85a95138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2061,7 +2061,7 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, @@ -2071,8 +2071,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#endif -#if defined(CONFIG_TREE_RCU) { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, -- cgit v1.2.3 From 5bfd5d3e2ec883a3db3414a42d94d23961a790ed Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Sun, 22 May 2022 13:29:33 +0800 Subject: kernel/sysctl.c: Clean up indentation, replace spaces with tab. This patch fixes two coding style issues: 1. Clean up indentation, replace spaces with tab 2. Add space after ',' Signed-off-by: Fanjun Kong Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a0e85a95138..223376959d29 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1315,8 +1315,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_userhz_jiffies_conv, NULL); } /** -- cgit v1.2.3 From 374a723c7448bbea22846884ba336ed83b085aab Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Mon, 16 May 2022 17:07:53 +0800 Subject: kernel/sysctl.c: Remove trailing white space This patch removes the trailing white space in kernel/sysysctl.c Signed-off-by: Fanjun Kong Reviewed-by: Muchun Song [mcgrof: fix commit message subject] Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 223376959d29..205d605cacc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -492,12 +492,12 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int *i, vleft, first = 1, err = 0; size_t left; char *p; - + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -729,7 +729,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer, * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ @@ -1273,7 +1273,7 @@ static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lv * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * @@ -1306,8 +1306,8 @@ int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. -- cgit v1.2.3 From 8ebc4123c1445ef11a9989d9bc676691a1d43302 Mon Sep 17 00:00:00 2001 From: Dong Chuanjian Date: Mon, 22 Aug 2022 14:30:49 +0800 Subject: kernel/sysctl.c: remove unnecessary (void*) conversions remove unnecessary void* type casting Signed-off-by: Dong Chuanjian Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..324e6bbbeb34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1052,9 +1052,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, return 0; } - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; + i = data; + min = table->extra1; + max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; -- cgit v1.2.3 From feb2bd010aec77a1cf981d2649183c64cd4870a0 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 5 Sep 2022 20:47:24 +0800 Subject: sysctl: remove max_extfrag_threshold Remove max_extfrag_threshold and replace by SYSCTL_ONE_THOUSAND. No functional change. Signed-off-by: Liu Shixin Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 324e6bbbeb34..368909f86469 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -129,11 +129,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; int sysctl_legacy_va_layout; #endif -#ifdef CONFIG_COMPACTION -/* min_extfrag_threshold is SYSCTL_ZERO */; -static const int max_extfrag_threshold = 1000; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -2216,7 +2211,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&max_extfrag_threshold, + .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "compact_unevictable_allowed", -- cgit v1.2.3 From b13bc7cbb931727b1b0a63594cd734bfd979e985 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 8 Sep 2022 16:29:46 +0800 Subject: kernel/sysctl.c: move sysctl_vals and sysctl_long_vals to sysctl.c sysctl_vals and sysctl_long_vals are declared even if sysctl is disabled. Move its definition to sysctl.c to make sure their integrity in any case. Signed-off-by: Liu Shixin Signed-off-by: Luis Chamberlain --- fs/proc/proc_sysctl.c | 7 ------- kernel/sysctl.c | 9 ++++++++- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 50ba9e4fb284..48f2d60bd78a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -28,13 +28,6 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; -EXPORT_SYMBOL(sysctl_vals); - -const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; -EXPORT_SYMBOL_GPL(sysctl_long_vals); - /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 368909f86469..82ab288758f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,9 +82,16 @@ #include #endif +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; +EXPORT_SYMBOL(sysctl_vals); + +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + #if defined(CONFIG_SYSCTL) -/* Constants used for minimum and maximum */ +/* Constants used for minimum and maximum */ #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; -- cgit v1.2.3 From c6833e10008f976a173dd5abdf992e492cbc3bcf Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:52 +0800 Subject: memory tiering: rate limit NUMA migration throughput In NUMA balancing memory tiering mode, if there are hot pages in slow memory node and cold pages in fast memory node, we need to promote/demote hot/cold pages between the fast and cold memory nodes. A choice is to promote/demote as fast as possible. But the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workload because of accessing inflating and slow memory bandwidth contention. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting. But the workload latency will be better. This is implemented in this patch as the page promotion rate limit mechanism. The number of the candidate pages to be promoted to the fast memory node via NUMA balancing is counted, if the count exceeds the limit specified by the users, the NUMA balancing promotion will be stopped until the next second. A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added for the users to specify the limit. Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 11 ++++++++++ include/linux/mmzone.h | 7 ++++++ include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 33 +++++++++++++++++++++++++++-- kernel/sysctl.c | 8 +++++++ mm/vmstat.c | 1 + 6 files changed, 59 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ee6572b1edad..835c8844bba4 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -635,6 +635,17 @@ different types of memory (represented as different NUMA nodes) to place the hot pages in the fast memory. This is implemented based on unmapping and page fault too. +numa_balancing_promote_rate_limit_MBps +====================================== + +Too high promotion/demotion throughput between different memory types +may hurt application latency. This can be used to rate limit the +promotion throughput. The per-node max promotion throughput in MB/s +will be limited to be no more than the set value. + +A rule of thumb is to set this to less than 1/10 of the PMEM node +write bandwidth. + oops_all_cpu_backtrace ====================== diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f571dc7c524..a0003eaa751f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ + PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -998,6 +999,12 @@ typedef struct pglist_data { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_NUMA_BALANCING + /* start time in ms of current promote rate limit period */ + unsigned int nbp_rl_start; + /* number of promote candidate pages at start time of current rate limit period */ + unsigned long nbp_rl_nr_cand; +#endif /* Fields commonly accessed by the page reclaim scanner */ /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e650946816d0..303ee7dd0c7e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -27,6 +27,7 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; +extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06db566c7660..1d1dd88daaab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; /* The page with hint page fault latency < threshold in ms is considered hot */ unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struct page *page) return (time - last_time) & PAGE_ACCESS_TIME_MASK; } +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !node_is_toptier(src_nid)) { struct pglist_data *pgdat; - unsigned long latency, th; + unsigned long rate_limit, latency, th; pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) @@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (latency >= th) return false; - return true; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..f10a610aa834 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #endif /* CONFIG_NUMA_BALANCING */ { .procname = "panic", diff --git a/mm/vmstat.c b/mm/vmstat.c index 90af9a8572f5..c109167a669c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = { #endif #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", + "pgpromote_candidate", #endif /* enum writeback_stat_item counters */ -- cgit v1.2.3 From e6cfaf34be9fcd1a8285a294e18986bfc41a409c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 5 Dec 2022 11:33:40 -0800 Subject: proc: avoid integer type confusion in get_proc_long proc_get_long() is passed a size_t, but then assigns it to an 'int' variable for the length. Let's not do that, even if our IO paths are limited to MAX_RW_COUNT (exactly because of these kinds of type errors). So do the proper test in the rigth type. Reported-by: Kyle Zeng Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 188c305aeb8b..8898ddeaaf75 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -342,13 +342,12 @@ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, const char *perm_tr, unsigned perm_tr_len, char *tr) { - int len; char *p, tmp[TMPBUFLEN]; + ssize_t len = *size; - if (!*size) + if (len <= 0) return -EINVAL; - len = *size; if (len > TMPBUFLEN - 1) len = TMPBUFLEN - 1; -- cgit v1.2.3 From bce9332220bd677d83b19d21502776ad555a0e73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 5 Dec 2022 12:09:06 -0800 Subject: proc: proc_skip_spaces() shouldn't think it is working on C strings proc_skip_spaces() seems to think it is working on C strings, and ends up being just a wrapper around skip_spaces() with a really odd calling convention. Instead of basing it on skip_spaces(), it should have looked more like proc_skip_char(), which really is the exact same function (except it skips a particular character, rather than whitespace). So use that as inspiration, odd coding and all. Now the calling convention actually makes sense and works for the intended purpose. Reported-and-tested-by: Kyle Zeng Acked-by: Eric Dumazet Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8898ddeaaf75..c6d9dec11b74 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -267,13 +267,14 @@ int proc_dostring(struct ctl_table *table, int write, ppos); } -static size_t proc_skip_spaces(char **buf) +static void proc_skip_spaces(char **buf, size_t *size) { - size_t ret; - char *tmp = skip_spaces(*buf); - ret = tmp - *buf; - *buf = tmp; - return ret; + while (*size) { + if (!isspace(**buf)) + break; + (*size)--; + (*buf)++; + } } static void proc_skip_char(char **buf, size_t *size, const char v) @@ -520,7 +521,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) break; @@ -547,7 +548,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; @@ -589,7 +590,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) { err = -EINVAL; goto out_free; @@ -609,7 +610,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, } if (!err && left) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); out_free: if (err) @@ -1074,7 +1075,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, if (write) { bool neg; - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (!left) break; @@ -1103,7 +1104,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&p); + proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; -- cgit v1.2.3