From 1a38416cea8ac801ae8f261074721f35317613dc Mon Sep 17 00:00:00 2001 From: David Shaohua Li Date: Wed, 23 Nov 2005 12:36:00 -0500 Subject: [ACPI] SMP S3 resume: evaluate _WAK after INIT On SMP resume from S3, we reset (INIT) the non-boot processors to boot them cleanly. But the BIOS needs to execute _WAK after INIT in order to properly initialize these processors upon resume. http://bugzilla.kernel.org/show_bug.cgi?id=5651 Signed-off-by: David Shaohua Li Signed-off-by: Len Brown --- kernel/power/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d253f3ae2fa5..9cb235cba4a9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -133,10 +133,10 @@ static int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { device_resume(); - if (pm_ops && pm_ops->finish) - pm_ops->finish(state); thaw_processes(); enable_nonboot_cpus(); + if (pm_ops && pm_ops->finish) + pm_ops->finish(state); pm_restore_console(); } -- cgit v1.2.3 From 729b4d4ce1982c52040bbf22d6711cdf8db07ad8 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Thu, 1 Dec 2005 04:29:00 -0500 Subject: [ACPI] fix reboot upon suspend-to-disk http://bugzilla.kernel.org/show_bug.cgi?id=4320 Signed-off-by: Alexey Starikovskiy Acked-by: Pavel Machek Signed-off-by: Len Brown --- drivers/acpi/sleep/poweroff.c | 15 +++++++++------ drivers/acpi/sleep/sleep.h | 2 +- drivers/acpi/sleep/wakeup.c | 6 +++--- include/linux/kernel.h | 1 + include/linux/reboot.h | 3 +-- kernel/power/disk.c | 9 +-------- kernel/sys.c | 25 ++++++++++--------------- 7 files changed, 26 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index af7935a95bcc..47fb4b394eec 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -33,9 +33,7 @@ int acpi_sleep_prepare(u32 acpi_state) ACPI_FLUSH_CPU_CACHE(); 
acpi_enable_wakeup_device_prep(acpi_state); #endif - if (acpi_state == ACPI_STATE_S5) { - acpi_wakeup_gpe_poweroff_prepare(); - } + acpi_gpe_sleep_prepare(acpi_state); acpi_enter_sleep_state_prep(acpi_state); return 0; } @@ -53,11 +51,16 @@ void acpi_power_off(void) static int acpi_shutdown(struct sys_device *x) { - if (system_state == SYSTEM_POWER_OFF) { - /* Prepare if we are going to power off the system */ + switch (system_state) { + case SYSTEM_POWER_OFF: + /* Prepare to power off the system */ return acpi_sleep_prepare(ACPI_STATE_S5); + case SYSTEM_SUSPEND_DISK: + /* Prepare to suspend the system to disk */ + return acpi_sleep_prepare(ACPI_STATE_S4); + default: + return 0; } - return 0; } static struct sysdev_class acpi_sysclass = { diff --git a/drivers/acpi/sleep/sleep.h b/drivers/acpi/sleep/sleep.h index efd0001c6f05..f3e70397a7d6 100644 --- a/drivers/acpi/sleep/sleep.h +++ b/drivers/acpi/sleep/sleep.h @@ -5,4 +5,4 @@ extern int acpi_suspend (u32 state); extern void acpi_enable_wakeup_device_prep(u8 sleep_state); extern void acpi_enable_wakeup_device(u8 sleep_state); extern void acpi_disable_wakeup_device(u8 sleep_state); -extern void acpi_wakeup_gpe_poweroff_prepare(void); +extern void acpi_gpe_sleep_prepare(u32 sleep_state); diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c index 4134ed43d026..85df0ceda2a9 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/sleep/wakeup.c @@ -192,7 +192,7 @@ late_initcall(acpi_wakeup_device_init); * RUNTIME GPEs, we simply mark all GPES that * are not enabled for wakeup from S5 as RUNTIME. 
*/ -void acpi_wakeup_gpe_poweroff_prepare(void) +void acpi_gpe_sleep_prepare(u32 sleep_state) { struct list_head *node, *next; @@ -201,8 +201,8 @@ void acpi_wakeup_gpe_poweroff_prepare(void) struct acpi_device, wakeup_list); - /* The GPE can wakeup system from S5, don't touch it */ - if ((u32) dev->wakeup.sleep_state == ACPI_STATE_S5) + /* The GPE can wakeup system from this state, don't touch it */ + if ((u32) dev->wakeup.sleep_state >= sleep_state) continue; /* acpi_set_gpe_type will automatically disable GPE */ acpi_set_gpe_type(dev->wakeup.gpe_device, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b1e407a4fbda..73aa55a73334 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -181,6 +181,7 @@ extern enum system_states { SYSTEM_HALT, SYSTEM_POWER_OFF, SYSTEM_RESTART, + SYSTEM_SUSPEND_DISK, } system_state; #define TAINT_PROPRIETARY_MODULE (1<<0) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 7ab2cdb83ef0..015297ff73fa 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -60,8 +60,7 @@ extern void machine_crash_shutdown(struct pt_regs *); */ extern void kernel_restart_prepare(char *cmd); -extern void kernel_halt_prepare(void); -extern void kernel_power_off_prepare(void); +extern void kernel_shutdown_prepare(enum system_states state); extern void kernel_restart(char *cmd); extern void kernel_halt(void); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f4..f2cd279d07c7 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -52,7 +52,7 @@ static void power_down(suspend_disk_method_t mode) switch(mode) { case PM_DISK_PLATFORM: - kernel_power_off_prepare(); + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: @@ -119,13 +119,6 @@ static int prepare_processes(void) goto thaw; } - if (pm_disk_mode == PM_DISK_PLATFORM) { - if (pm_ops && pm_ops->prepare) { - if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) - goto 
thaw; - } - } - /* Free memory before shutting down devices. */ free_some_memory(); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526afe..c3b1874661fa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -427,23 +427,25 @@ void kernel_kexec(void) } EXPORT_SYMBOL_GPL(kernel_kexec); +void kernel_shutdown_prepare(enum system_states state) +{ + notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + device_shutdown(); +} /** * kernel_halt - halt the system * * Shutdown everything and perform a clean system halt. */ -void kernel_halt_prepare(void) -{ - notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_state = SYSTEM_HALT; - device_shutdown(); -} void kernel_halt(void) { - kernel_halt_prepare(); + kernel_shutdown_prepare(SYSTEM_HALT); printk(KERN_EMERG "System halted.\n"); machine_halt(); } + EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -451,20 +453,13 @@ EXPORT_SYMBOL_GPL(kernel_halt); * * Shutdown everything and perform a clean system power_off. */ -void kernel_power_off_prepare(void) -{ - notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_state = SYSTEM_POWER_OFF; - device_shutdown(); -} void kernel_power_off(void) { - kernel_power_off_prepare(); + kernel_shutdown_prepare(SYSTEM_POWER_OFF); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); - /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers -- cgit v1.2.3 From 3fa97c9db4f6f93f41f7a40d08872dbfd8dc907e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jan 2006 16:34:26 -0800 Subject: [PATCH] "Fix uidhash_lock <-> RXU deadlock" fix I get storms of warnings from local_bh_enable(). Better-tested patches, please. Cc: Ingo Molnar Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index d1ae2349347e..d9deae43a9ab 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -33,6 +33,10 @@ static struct list_head uidhash_table[UIDHASH_SZ]; * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. + * But free_uid() is also called with local interrupts disabled, and running + * local_bh_enable() with local interrupts disabled is an error - we'll run + * softirq callbacks, and they can unconditionally enable interrupts, and + * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); @@ -89,16 +93,19 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *has struct user_struct *find_user(uid_t uid) { struct user_struct *ret; + unsigned long flags; - spin_lock_bh(&uidhash_lock); + spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); - spin_unlock_bh(&uidhash_lock); + spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); key_put(up->uid_keyring); @@ -106,7 +113,7 @@ void free_uid(struct user_struct *up) kmem_cache_free(uid_cachep, up); spin_unlock(&uidhash_lock); } - local_bh_enable(); + local_irq_restore(flags); } struct user_struct * alloc_uid(uid_t uid) @@ -114,9 +121,9 @@ struct user_struct * alloc_uid(uid_t uid) struct list_head *hashent = uidhashentry(uid); struct user_struct *up; - spin_lock_bh(&uidhash_lock); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); - 
spin_unlock_bh(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); if (!up) { struct user_struct *new; @@ -146,7 +153,7 @@ struct user_struct * alloc_uid(uid_t uid) * Before adding this, check whether we raced * on adding the same user already.. */ - spin_lock_bh(&uidhash_lock); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { key_put(new->uid_keyring); @@ -156,7 +163,7 @@ struct user_struct * alloc_uid(uid_t uid) uid_hash_insert(new, hashent); up = new; } - spin_unlock_bh(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); } return up; @@ -192,9 +199,9 @@ static int __init uid_cache_init(void) INIT_LIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ - spin_lock_bh(&uidhash_lock); + spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(0)); - spin_unlock_bh(&uidhash_lock); + spin_unlock_irq(&uidhash_lock); return 0; } -- cgit v1.2.3 From 853609b61ef88b414ffd1613741aa59894334320 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Feb 2006 03:05:07 -0800 Subject: [PATCH] swsusp: use bytes as image size units Make swsusp use bytes as the image size units, which is needed for future compatibility. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/power/interface.txt | 2 +- Documentation/power/swsusp.txt | 2 +- kernel/power/disk.c | 6 +++--- kernel/power/power.h | 4 ++-- kernel/power/swsusp.c | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index bd4ffb5bd49a..4117802af0f8 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -44,7 +44,7 @@ it. /sys/power/image_size controls the size of the image created by the suspend-to-disk mechanism. 
It can be written a string representing a non-negative integer that will be used as an upper -limit of the image size, in megabytes. The suspend-to-disk mechanism will +limit of the image size, in bytes. The suspend-to-disk mechanism will do its best to ensure the image size will not exceed that number. However, if this turns out to be impossible, it will try to suspend anyway using the smallest image possible. In particular, if "0" is written to this file, the diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index 08c79d4dc540..b28b7f04abb8 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -27,7 +27,7 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state echo platform > /sys/power/disk; echo disk > /sys/power/state -If you want to limit the suspend image size to N megabytes, do +If you want to limit the suspend image size to N bytes, do echo N > /sys/power/image_size diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e24446f8d8cd..f2b3b0ea512a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -367,14 +367,14 @@ power_attr(resume); static ssize_t image_size_show(struct subsystem * subsys, char *buf) { - return sprintf(buf, "%u\n", image_size); + return sprintf(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) { - unsigned int size; + unsigned long size; - if (sscanf(buf, "%u", &size) == 1) { + if (sscanf(buf, "%lu", &size) == 1) { image_size = size; return n; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 7e8492fd1423..61beb5e0e927 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -51,8 +51,8 @@ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; -/* Preferred image size in MB (default 500) */ -extern unsigned int image_size; +/* Preferred image size in bytes (default 500 MB) */ +extern unsigned long 
image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 55a18d26abed..59c91c148e82 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -70,12 +70,12 @@ #include "power.h" /* - * Preferred image size in MB (tunable via /sys/power/image_size). + * Preferred image size in bytes (tunable via /sys/power/image_size). * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N MB, but if that is impossible, it will + * size will not exceed N bytes, but if that is impossible, it will * try to create the smallest image possible. */ -unsigned int image_size = 500; +unsigned long image_size = 500 * 1024 * 1024; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); @@ -590,7 +590,7 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; - } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { + } else if (size > image_size / PAGE_SIZE) { tmp = shrink_all_memory(SHRINK_BITE); pages += tmp; } -- cgit v1.2.3 From bc1978d404befacd272d0321ef749cc3192e488b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Feb 2006 03:05:08 -0800 Subject: [PATCH] hrtimers: fixup itimer conversion The itimer conversion removed the locking which protects the timer and variables in the shared signal structure. Steven Rostedt found the problem in the latest -rt patches. 
Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index c2c05c4ff28d..6433d0685506 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -49,9 +49,11 @@ int do_getitimer(int which, struct itimerval *value) switch (which) { case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: read_lock(&tasklist_lock); @@ -150,8 +152,14 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) switch (which) { case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); timer = &tsk->signal->real_timer; - hrtimer_cancel(timer); + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval @@ -162,6 +170,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) expires = timeval_to_ktime(value->it_value); if (expires.tv64 != 0) hrtimer_start(timer, expires, HRTIMER_REL); + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); -- cgit v1.2.3 From b6557fbca805217588a412f391a65ceafcf1a1af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Feb 2006 03:05:09 -0800 Subject: [PATCH] hrtimers: fix possible use of NULL pointer in posix-timers Fixup the conversion of posix-timers to hrtimers. 
Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 197208b3aa2a..3b606d361b52 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -290,7 +290,8 @@ void do_schedule_next_timer(struct siginfo *info) info->si_overrun = timr->it_overrun_last; } - unlock_timer(timr, flags); + if (timr) + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr,int si_private) -- cgit v1.2.3 From a16a1c095a2392d49fafea22f3a508e268ef7167 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Feb 2006 03:05:09 -0800 Subject: [PATCH] hrtimers: fix oldvalue return in setitimer This resolves bugzilla bug#5617. The oldvalue of the timer was read after the timer was cancelled, so the remaining time was always zero. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 6433d0685506..379be2f8c84c 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -155,16 +155,16 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) again: spin_lock_irq(&tsk->sighand->siglock); timer = &tsk->signal->real_timer; - /* We are sharing ->siglock with it_real_fn() */ - if (hrtimer_try_to_cancel(timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval = ktime_to_timeval(tsk->signal->it_real_incr); } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } tsk->signal->it_real_incr = timeval_to_ktime(value->it_interval); expires = timeval_to_ktime(value->it_value); -- 
cgit v1.2.3 From ff60a5dc4fa584d47022d2533bc5c53b80096fb5 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Wed, 1 Feb 2006 03:05:10 -0800 Subject: [PATCH] hrtimers: fix posix-timer requeue race From: Steven Rostedt <rostedt@goodmis.org> CPU0 expires a posix-timer and runs the callback function. The signal is queued. After releasing the posix-timer lock and before returning to hrtimer_run_queue CPU0 gets interrupted. CPU1 delivers the queued signal and rearms the timer. CPU0 comes back to hrtimer_run_queue and sets the timer state to expired. The next modification of the timer can result in an oops, because the state information is wrong. Keep track of state = RUNNING and check if the state has been changed in the return path of hrtimer_run_queue. In case the state has been changed, ignore a restart request and do not touch the state variable. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 1 + kernel/hrtimer.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 089bfb1fa01a..c657f3d4924a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -40,6 +40,7 @@ enum hrtimer_restart { enum hrtimer_state { HRTIMER_INACTIVE, /* Timer is inactive */ HRTIMER_EXPIRED, /* Timer is expired */ + HRTIMER_RUNNING, /* Timer is running the callback function */ HRTIMER_PENDING, /* Timer is pending */ }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f1c4155b49ac..f580dd9db286 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -550,6 +550,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) fn = timer->function; data = timer->data; set_curr_timer(base, timer); + timer->state = HRTIMER_RUNNING; __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); @@ -565,6 +566,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) spin_lock_irq(&base->lock); + 
/* Another CPU has added back the timer */ + if (timer->state != HRTIMER_RUNNING) + continue; + if (restart == HRTIMER_RESTART) enqueue_hrtimer(timer, base); else -- cgit v1.2.3 From 7978672c4d9a1e6a6081de3a9d9ba5e5b24904a0 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Wed, 1 Feb 2006 03:05:11 -0800 Subject: [PATCH] hrtimers: cleanups and simplifications Clean up the interface to hrtimers by changing the init code to pass the mode as well as the clock. This allow the init code to select the correct base and eliminates extra timer re-init code in posix-timers. We also simplify the restart interface nanosleep use. Signed-off-by: George Anzinger Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 5 ++--- kernel/fork.c | 2 +- kernel/hrtimer.c | 59 ++++++++++++++++++++----------------------------- kernel/posix-timers.c | 37 ++++++++----------------------- 4 files changed, 36 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c657f3d4924a..6361544bb6ae 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -101,9 +101,8 @@ struct hrtimer_base { /* Exported timer functions: */ /* Initialize timers: */ -extern void hrtimer_init(struct hrtimer *timer, const clockid_t which_clock); -extern void hrtimer_rebase(struct hrtimer *timer, const clockid_t which_clock); - +extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, + enum hrtimer_mode mode); /* Basic timer operations: */ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, diff --git a/kernel/fork.c b/kernel/fork.c index 4ae8cfc1c89c..7f0ab5ee948c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,7 +802,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); + 
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; sig->real_timer.data = tsk; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f580dd9db286..efff9496b2fa 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -66,6 +66,12 @@ EXPORT_SYMBOL_GPL(ktime_get_real); /* * The timer bases: + * + * Note: If we want to add new timer bases, we have to skip the two + * clock ids captured by the cpu-timers. We do this by holding empty + * entries rather than doing math adjustment of the clock ids. + * This ensures that we capture erroneous accesses to these clock ids + * rather than moving them into the range of valid clock id's. */ #define MAX_HRTIMER_BASES 2 @@ -483,29 +489,25 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } /** - * hrtimer_rebase - rebase an initialized hrtimer to a different base + * hrtimer_init - initialize a timer to the given clock * - * @timer: the timer to be rebased + * @timer: the timer to be initialized * @clock_id: the clock to be used + * @mode: timer mode abs/rel */ -void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id) +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) { struct hrtimer_base *bases; + memset(timer, 0, sizeof(struct hrtimer)); + bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); - timer->base = &bases[clock_id]; -} -/** - * hrtimer_init - initialize a timer to the given clock - * - * @timer: the timer to be initialized - * @clock_id: the clock to be used - */ -void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id) -{ - memset(timer, 0, sizeof(struct hrtimer)); - hrtimer_rebase(timer, clock_id); + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) + clock_id = CLOCK_MONOTONIC; + + timer->base = &bases[clock_id]; } /** @@ -643,8 +645,7 @@ schedule_hrtimer_interruptible(struct hrtimer *timer, return schedule_hrtimer(timer, mode); } -static long __sched 
-nanosleep_restart(struct restart_block *restart, clockid_t clockid) +static long __sched nanosleep_restart(struct restart_block *restart) { struct timespec __user *rmtp; struct timespec tu; @@ -654,7 +655,7 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid) restart->fn = do_no_restart_syscall; - hrtimer_init(&timer, clockid); + hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; @@ -674,16 +675,6 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid) return -ERESTART_RESTARTBLOCK; } -static long __sched nanosleep_restart_mono(struct restart_block *restart) -{ - return nanosleep_restart(restart, CLOCK_MONOTONIC); -} - -static long __sched nanosleep_restart_real(struct restart_block *restart) -{ - return nanosleep_restart(restart, CLOCK_REALTIME); -} - long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { @@ -692,7 +683,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct timespec tu; ktime_t rem; - hrtimer_init(&timer, clockid); + hrtimer_init(&timer, clockid, mode); timer.expires = timespec_to_ktime(*rqtp); @@ -700,7 +691,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, if (rem.tv64 <= 0) return 0; - /* Absolute timers do not update the rmtp value: */ + /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_ABS) return -ERESTARTNOHAND; @@ -710,11 +701,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return -EFAULT; restart = &current_thread_info()->restart_block; - restart->fn = (clockid == CLOCK_MONOTONIC) ? 
- nanosleep_restart_mono : nanosleep_restart_real; + restart->fn = nanosleep_restart; restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; restart->arg1 = timer.expires.tv64 >> 32; restart->arg2 = (unsigned long) rmtp; + restart->arg3 = (unsigned long) timer.base->index; return -ERESTART_RESTARTBLOCK; } @@ -741,10 +732,8 @@ static void __devinit init_hrtimers_cpu(int cpu) struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++) { + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) spin_lock_init(&base->lock); - base++; - } } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 3b606d361b52..28e72fd0029f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -194,9 +194,7 @@ static inline int common_clock_set(const clockid_t which_clock, static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock); - new_timer->it.real.timer.data = new_timer; - new_timer->it.real.timer.function = posix_timer_fn; + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); return 0; } @@ -693,6 +691,7 @@ common_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; if (old_setting) common_timer_get(timr, old_setting); @@ -714,14 +713,10 @@ common_timer_set(struct k_itimer *timr, int flags, if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - /* Posix madness. Only absolute CLOCK_REALTIME timers - * are affected by clock sets. So we must reiniatilize - * the timer. - */ - if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME)) - hrtimer_rebase(timer, CLOCK_REALTIME); - else - hrtimer_rebase(timer, CLOCK_MONOTONIC); + mode = flags & TIMER_ABSTIME ? 
HRTIMER_ABS : HRTIMER_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.data = timr; + timr->it.real.timer.function = posix_timer_fn; timer->expires = timespec_to_ktime(new_setting->it_value); @@ -732,8 +727,7 @@ common_timer_set(struct k_itimer *timr, int flags, if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) return 0; - hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ? - HRTIMER_ABS : HRTIMER_REL); + hrtimer_start(timer, timer->expires, mode); return 0; } @@ -948,21 +942,8 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { - int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; - int clockid = which_clock; - - switch (which_clock) { - case CLOCK_REALTIME: - /* Posix madness. Only absolute timers on clock realtime - are affected by clock set. */ - if (mode != HRTIMER_ABS) - clockid = CLOCK_MONOTONIC; - case CLOCK_MONOTONIC: - break; - default: - return -EINVAL; - } - return hrtimer_nanosleep(tsave, rmtp, mode, clockid); + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_ABS : HRTIMER_REL, which_clock); } asmlinkage long -- cgit v1.2.3 From 66188fae3bf7f8dd951e2291d2a81888ed1b65de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Feb 2006 03:05:13 -0800 Subject: [PATCH] hrtimers: add back lost credit lines At some point we added credits to people who actively helped to bring k/hr-timers along. This was lost in the big code revamp. Add it back. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 6 ++++++ kernel/hrtimer.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 1bd6552cc341..6aca67a569a2 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -10,6 +10,12 @@ * * Started by: Thomas Gleixner and Ingo Molnar * + * Credits: + * + * Roman Zippel provided the ideas and primary code snippets of + * the ktime_t union and further simplifications of the original + * code. + * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_KTIME_H diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index efff9496b2fa..2b6e1757aedd 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -21,6 +21,12 @@ * Credits: * based on kernel/timer.c * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * * For licencing details see kernel-base/COPYING */ -- cgit v1.2.3 From 952bbc87f01f552ef091a62ea2a721b5b2670e74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 1 Feb 2006 03:05:13 -0800 Subject: [PATCH] hrtimers: set correct initial expiry time for relative SIGEV_NONE timers The expiry time for relative timers with SIGEV_NONE set was never updated to the correct value. Pointed out by George Anzinger. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 28e72fd0029f..aad6f138d5c9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -724,8 +724,13 @@ common_timer_set(struct k_itimer *timr, int flags, timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + /* Setup correct expiry time for relative timers */ + if (mode == HRTIMER_REL) + timer->expires = ktime_add(timer->expires, + timer->base->get_time()); return 0; + } hrtimer_start(timer, timer->expires, mode); return 0; -- cgit v1.2.3 From 493f01d1d0699ddafc30067d33fcc18d0b95b624 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 1 Feb 2006 03:05:14 -0800 Subject: [PATCH] kernel/posix-timers.c: remove do_posix_clock_notimer_create() This function is neither used nor has any real contents. Signed-off-by: Adrian Bunk Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix-timers.h | 1 - kernel/posix-timers.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 54faf5236da0..95572c434bc9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -84,7 +84,6 @@ struct k_clock { void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); /* error handlers for timer_create, nanosleep and settime */ -int do_posix_clock_notimer_create(struct k_itimer *timer); int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *, struct timespec __user *); int do_posix_clock_nosettime(const clockid_t, struct timespec *tp); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index aad6f138d5c9..216f574b5ffb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -875,12 +875,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) } EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); -int do_posix_clock_notimer_create(struct k_itimer *timer) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); - int do_posix_clock_nonanosleep(const clockid_t clock, int flags, struct 
timespec *t, struct timespec __user *r) { -- cgit v1.2.3 From 2f7016d917faef8f1e016b4a7bd7f594694480b6 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 1 Feb 2006 03:05:18 -0800 Subject: [PATCH] sys_sched_getaffinity() & hotplug Change sched_getaffinity() so that it returns a bitmap that indicates the legally schedulable cpus that a task is allowed to run on. Without this patch, if CONFIG_HOTPLUG_CPU is enabled, sched_getaffinity() unconditionally returns (at least on IA64) a mask with NR_CPUS bits set. This conveys no useful information except for a kernel compile option. This fixes a breakage we observed running recent kernels. We have MPI jobs that use sched_getaffinity() to determine where to place their threads. Placing them on non-existent cpus is problematic :-) Signed-off-by: Jack Steiner Acked-by: Ingo Molnar Cc: Nathan Lynch Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec7fd9cee306..f77f23f8f479 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4031,7 +4031,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) goto out_unlock; retval = 0; - cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.3 From f7b8988ff50d99c99746f65f420364e91362c065 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Feb 2006 03:05:21 -0800 Subject: [PATCH] swsusp: do not change log level during suspend/resume Prevent the kernel from setting the log level to 10 unconditionally during suspend/resume which was needed in the past for debugging, but generally is undesirable. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/console.c | 12 +----------- kernel/power/power.h | 5 +++++ 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index 7ff375e7c95f..579d239d129f 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,18 +9,11 @@ #include #include "power.h" -static int new_loglevel = 10; -static int orig_loglevel; #ifdef SUSPEND_CONSOLE static int orig_fgconsole, orig_kmsg; -#endif int pm_prepare_console(void) { - orig_loglevel = console_loglevel; - console_loglevel = new_loglevel; - -#ifdef SUSPEND_CONSOLE acquire_console_sem(); orig_fgconsole = fg_console; @@ -41,18 +34,15 @@ int pm_prepare_console(void) } orig_kmsg = kmsg_redirect; kmsg_redirect = SUSPEND_CONSOLE; -#endif return 0; } void pm_restore_console(void) { - console_loglevel = orig_loglevel; -#ifdef SUSPEND_CONSOLE acquire_console_sem(); set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; -#endif return; } +#endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 61beb5e0e927..d8f0d1a76bae 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -42,8 +42,13 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; +#ifdef SUSPEND_CONSOLE extern int pm_prepare_console(void); extern void pm_restore_console(void); +#else +static int pm_prepare_console(void) { return 0; } +static void pm_restore_console(void) {} +#endif /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -- cgit v1.2.3 From c84db23c6e587d3ab00a41c51fedf758e1f6ecd4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:29 -0800 Subject: [PATCH] zone_reclaim: minor fixes - If we only reclaim nr_pages then its okay to stay on node. Switch from > to >= for the comparison. 
- vm_table[] entry for zone_reclaim_mode is a bit screwed up. - Add empty lines around shrink_zone to show that this is the central function to be called. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- mm/vmscan.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb99a42f8b37..c74f03bc0144 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -878,7 +878,8 @@ static ctl_table vm_table[] = { .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, .proc_handler = &proc_dointvec, - .strategy = &zero, + .strategy = &sysctl_intvec, + .extra1 = &zero, }, #endif { .ctl_name = 0 } diff --git a/mm/vmscan.c b/mm/vmscan.c index a29efb2c06c8..61ca0097c834 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1636,14 +1636,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; if (sc.nr_reclaimed == 0) zone->last_unsuccessful_zone_reclaim = jiffies; - return sc.nr_reclaimed > nr_pages; + return sc.nr_reclaimed >= nr_pages; } #endif -- cgit v1.2.3 From 2a11ff06d7d12be5d1bbcf592fff649b45ac2388 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:33 -0800 Subject: [PATCH] zone_reclaim: configurable off node allocation period. Currently the zone_reclaim code has a fixed window of 30 seconds of off node allocations should a local zone have no unused pagecache pages left. Reclaim will be attempted again after this timeout period to avoid repeated useless scans for memory. This is also useful to establish sufficiently large off node allocation chunks to relieve the local node. It may be beneficial to adjust that time period for some special situations. 
For example if memory use was exceeding node capacity one may want to give up for longer periods of time. If memory spikes intermittently then one may want to shorten the time period to reduce the number of off node allocations. This patch allows just that.... Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ include/linux/swap.h | 1 + include/linux/sysctl.h | 3 ++- kernel/sysctl.c | 9 +++++++++ mm/vmscan.c | 4 ++-- 5 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 391dd64363e7..44518c023949 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm: - block_dump - drop-caches - zone_reclaim_mode +- zone_reclaim_interval ============================================================== @@ -137,4 +138,15 @@ of memory should be used for caching files from disk. It may be beneficial to switch this on if one wants to do zone reclaim regardless of the numa distances in the system. +================================================================ + +zone_reclaim_interval: + +The time allowed for off node allocations after zone reclaim +has failed to reclaim enough pages to allow a local allocation. + +Time is set in seconds and set by default to 30 seconds. + +Reduce the interval if undesired off node allocations occur. However, too +frequent scans will have a negative impact onoff node allocation performance. 
diff --git a/include/linux/swap.h b/include/linux/swap.h index 4a99e4a7fbf3..e53fef7051e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,6 +178,7 @@ extern int vm_swappiness; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8352a7ce5895..32a4139c4ad8 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -182,7 +182,8 @@ enum VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ - VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c74f03bc0144..71dd6f62efec 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -881,6 +881,15 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, #endif { .ctl_name = 0 } }; diff --git a/mm/vmscan.c b/mm/vmscan.c index f8b94ea6f722..8760a4abfa1f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1595,7 +1595,7 @@ int zone_reclaim_mode __read_mostly; /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; /* * Priority for ZONE_RECLAIM. 
This determines the fraction of pages @@ -1617,7 +1617,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || -- cgit v1.2.3 From e65cefe87beda627c0bfba39b387ee4bffedc93c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 3 Feb 2006 03:03:42 -0800 Subject: [PATCH] kernel/kprobes.c: fix a warning #ifndef ARCH_SUPPORTS_KRETPROBES kernel/kprobes.c:353: warning: 'pre_handler_kretprobe' defined but not used Signed-off-by: Adrian Bunk Acked-by: Ananth N Mavinakayanahalli Acked-by: "Keshavamurthy, Anil S" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3ea6325228da..95ad7f8db3d6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -344,23 +344,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) spin_unlock_irqrestore(&kretprobe_lock, flags); } -/* - * This kprobe pre_handler is registered with every kretprobe. When probe - * hits it will set up the return probe. - */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; - - /*TODO: consider to only swap the RA after the last pre_handler fired */ - spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); - spin_unlock_irqrestore(&kretprobe_lock, flags); - return 0; -} - static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; @@ -578,6 +561,23 @@ void __kprobes unregister_jprobe(struct jprobe *jp) #ifdef ARCH_SUPPORTS_KRETPROBES +/* + * This kprobe pre_handler is registered with every kretprobe. 
When probe + * hits it will set up the return probe. + */ +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + unsigned long flags = 0; + + /*TODO: consider to only swap the RA after the last pre_handler fired */ + spin_lock_irqsave(&kretprobe_lock, flags); + arch_prepare_kretprobe(rp, regs); + spin_unlock_irqrestore(&kretprobe_lock, flags); + return 0; +} + int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; -- cgit v1.2.3 From 278ff9537030bbb292b33504f5e1f6e0126793eb Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 3 Feb 2006 03:03:43 -0800 Subject: [PATCH] Kprobes: Fix deadlock in function-return probes When two function-return probes are inserted on kfree()[1] and the second on say, sys_link()[2], and later [2] is unregistered, we have a deadlock as kfree is called with the kretprobe_lock held and the function-return probe on kfree will also try to grab the same lock. However, we can move the kfree() during unregistration to outside the spinlock as we are sure that no instances from the free list will be used after synchronized_sched() returns during the unregistration process. Thanks to Masami Hiramatsu for spotting this. 
Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95ad7f8db3d6..fef1af8a73ce 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -631,12 +631,12 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) unregister_kprobe(&rp->kp); /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - free_rp_inst(rp); while ((ri = get_used_rp_inst(rp)) != NULL) { ri->rp = NULL; hlist_del(&ri->uflist); } spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); } static int __init init_kprobes(void) -- cgit v1.2.3 From 54e8ce463a7e21dbe9dad57723ed47653ee5db15 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Fri, 3 Feb 2006 03:03:53 -0800 Subject: [PATCH] Tell kallsyms_lookup_name() to ignore type U entries When one module exports a function symbol and another module uses that symbol then kallsyms shows the symbol twice. Once from the consumer with a type of 'U' and once from the provider with a type of 't' or 'T'. On most architectures, both entries have the same address so it does not matter which one is returned by kallsyms_lookup_name(). But on architectures with function descriptors, the 'U' entry points to the descriptor, not to the code body, which is not what we want. IA64 # grep -w qla2x00_remove_one /proc/kallsyms a000000208c25ef8 U qla2x00_remove_one [qla2300] <= descriptor a000000208bf44c0 t qla2x00_remove_one [qla2xxx] <= function body Tell kallsyms_lookup_name() to ignore type U entries in modules. 
Signed-off-by: Keith Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 618ed6e23ecc..e058aedf6b93 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2092,7 +2092,8 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) unsigned int i; for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) + if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && + mod->symtab[i].st_info != 'U') return mod->symtab[i].st_value; return 0; } -- cgit v1.2.3 From 88fc3897e3219e63ae6e2d180a6c87d033ef9f3b Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Fri, 3 Feb 2006 03:04:20 -0800 Subject: [PATCH] Normalize timespec for negative values in ns_to_timespec - In case of a negative nsec value the result of the division must be normalized. - Remove inline from an exported function. Signed-off-by: George Anzinger Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 1f23e683d6aa..804539165d8b 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -637,15 +637,16 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) * * Returns the timespec representation of the nsec parameter. 
*/ -inline struct timespec ns_to_timespec(const nsec_t nsec) +struct timespec ns_to_timespec(const nsec_t nsec) { struct timespec ts; - if (nsec) - ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, - &ts.tv_nsec); - else - ts.tv_sec = ts.tv_nsec = 0; + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); + if (unlikely(nsec < 0)) + set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); return ts; } -- cgit v1.2.3 From fe85a998ca64a067e58ca9240ec54a95994d78ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Feb 2006 03:04:23 -0800 Subject: [PATCH] cpuset: fix sparse warning kernel/cpuset.c:644:38: warning: non-ANSI function declaration of function 'cpuset_update_task_memory_state' Signed-off-by: Randy Dunlap Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fe2f71f92ae0..ba42b0a76961 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -641,7 +641,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -void cpuset_update_task_memory_state() +void cpuset_update_task_memory_state(void) { int my_cpusets_mem_gen; struct task_struct *tsk = current; -- cgit v1.2.3 From 514a01b880d28a3029d9e35de72ad8d2f95b31d0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Feb 2006 03:04:41 -0800 Subject: [PATCH] uninline __sigqueue_free() Five callsites. 
I dunno how all this crap got back in there :( Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d3efafd8109a..b373fc2420da 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -283,7 +283,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, return(q); } -static inline void __sigqueue_free(struct sigqueue *q) +static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; -- cgit v1.2.3 From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Feb 2006 23:27:36 -0800 Subject: [PATCH] percpu data: only iterate over possible CPUs percpu_data blindly allocates bootmem memory to store NR_CPUS instances of cpudata, instead of allocating memory only for possible cpus. As a preparation for changing that, we need to convert various 0 -> NR_CPUS loops to use for_each_cpu(). (The above only applies to users of asm-generic/percpu.h. powerpc has gone it alone and is presently only allocating memory for present CPUs, so it's currently corrupting memory). Signed-off-by: Eric Dumazet Cc: "David S. 
Miller" Cc: James Bottomley Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Anton Blanchard Acked-by: William Irwin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/nmi.c | 2 +- block/ll_rw_blk.c | 2 +- drivers/scsi/scsi.c | 2 +- fs/file.c | 3 +-- kernel/sched.c | 2 +- mm/page_alloc.c | 10 ++++++---- net/core/dev.c | 2 +- net/core/utils.c | 4 ++-- net/ipv4/proc.c | 2 +- net/ipv6/proc.c | 2 +- net/socket.c | 2 +- 11 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d661703ac1cb..63f39a7e2c96 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -138,7 +138,7 @@ static int __init check_nmi_watchdog(void) if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index f9fc07efd2da..e5aad8314585 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3453,7 +3453,7 @@ int __init blk_dev_init(void) iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 245ca99a641e..c551bb84dbfb 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -1245,7 +1245,7 @@ static int __init init_scsi(void) if (error) goto cleanup_sysctl; - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) INIT_LIST_HEAD(&per_cpu(scsi_done_q, i)); devfs_mk_dir("scsi"); diff --git a/fs/file.c b/fs/file.c index fd066b261c75..cea7cbea11d0 100644 --- a/fs/file.c +++ b/fs/file.c @@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu) 
void __init files_defer_init(void) { int i; - /* Really early - can't use for_each_cpu */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) fdtable_defer_list_init(i); } diff --git a/kernel/sched.c b/kernel/sched.c index f77f23f8f479..839466fdfb4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6109,7 +6109,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { prio_array_t *array; rq = cpu_rq(i); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44b4eb4202d9..dde04ff4be31 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; - memset(ret, 0, sizeof(*ret)); + memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { unsigned long *in, *out, off; + if (!cpu_isset(cpu, *cpumask)) + continue; + in = (unsigned long *)&per_cpu(page_states, cpu); cpu = next_cpu(cpu, *cpumask); - if (cpu < NR_CPUS) + if (likely(cpu < NR_CPUS)) prefetch(&per_cpu(page_states, cpu)); out = (unsigned long *)ret; @@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static struct per_cpu_pageset - boot_pageset[NR_CPUS]; +static struct per_cpu_pageset boot_pageset[NR_CPUS]; /* * Dynamically allocate memory for the diff --git a/net/core/dev.c b/net/core/dev.c index ffb82073056e..2afb0de95329 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3237,7 +3237,7 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. 
*/ - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct softnet_data *queue; queue = &per_cpu(softnet_data, i); diff --git a/net/core/utils.c b/net/core/utils.c index ac1d1fcf8673..fdc4f38bc46c 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -121,7 +121,7 @@ void __init net_random_init(void) { int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct nrnd_state *state = &per_cpu(net_rand_state,i); __net_srandom(state, i+jiffies); } @@ -133,7 +133,7 @@ static int net_random_reseed(void) unsigned long seed[NR_CPUS]; get_random_bytes(seed, sizeof(seed)); - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { struct nrnd_state *state = &per_cpu(net_rand_state,i); __net_srandom(state, seed[i]); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 39d49dc333a7..1b167c4bb3be 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -49,7 +49,7 @@ static int fold_prot_inuse(struct proto *proto) int res = 0; int cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) res += proto->stats[cpu].inuse; return res; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 50a13e75d70e..4238b1ed8860 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,7 @@ static int fold_prot_inuse(struct proto *proto) int res = 0; int cpu; - for (cpu=0; cpu<NR_CPUS; cpu++) + for_each_cpu(cpu) res += proto->stats[cpu].inuse; return res; diff --git a/net/socket.c b/net/socket.c index b38a263853c3..a00851f981db 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2078,7 +2078,7 @@ void socket_seq_show(struct seq_file *seq) int cpu; int counter = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_cpu(cpu) counter += per_cpu(sockets_in_use, cpu); /* It can be negative, by the way. 8) */ -- cgit v1.2.3 From bd576c9523fbf23e94fb7dbe05d2ae1cf96864e4 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Sat, 4 Feb 2006 23:27:42 -0800 Subject: [PATCH] sched: only print migration_cost once per boot migration_cost prints after every CPU hotplug event. Make it print only once at boot. 
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 839466fdfb4c..bc38804e40dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5551,13 +5551,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); + if (system_state == SYSTEM_BOOTING) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); } - printk("\n"); j1 = jiffies; if (migration_debug) printk("migration: %ld seconds\n", (j1-j0)/HZ); -- cgit v1.2.3 From 5c0d5d262aa4c5e93f9f5de298cf25d6d8b558c4 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sat, 4 Feb 2006 23:27:49 -0800 Subject: [PATCH] missing license tag in intermodule It may suck something awful, but it shouldn't taint the kernel. Signed-off-by: Dave Jones Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/intermodule.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 0cbe633420fb..55b1e5b85db9 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -179,3 +179,6 @@ EXPORT_SYMBOL(inter_module_register); EXPORT_SYMBOL(inter_module_unregister); EXPORT_SYMBOL(inter_module_get_request); EXPORT_SYMBOL(inter_module_put); + +MODULE_LICENSE("GPL"); + -- cgit v1.2.3