[PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled then per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds then a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive: it doesn't try to unlock the box in any way, it
only gets the debug info out, automatically, and on all CPUs affected by
the lockup.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Ingo Molnar 2005-09-06 15:16:27 -07:00 committed by Linus Torvalds
parent 4732efbeb9
commit 8446f1d391
12 changed files with 201 additions and 0 deletions

View file

@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
*/
for (i = 0; i < NR_CPUS; i++)
alert_counter[i] = 0;
/*
* Tickle the softlockup detector too:
*/
touch_softlockup_watchdog();
}
extern void die_nmi(struct pt_regs *, const char *msg);

View file

@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
last_timer->resume();
cur_timer = last_timer;
last_timer = NULL;
touch_softlockup_watchdog();
return 0;
}

View file

@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
*/
for (i = 0; i < NR_CPUS; i++)
per_cpu(nmi_touch, i) = 1;
touch_softlockup_watchdog();
}
void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)

View file

@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
write_sequnlock_irqrestore(&xtime_lock,flags);
jiffies += sleep_length;
wall_jiffies += sleep_length;
touch_softlockup_watchdog();
return 0;
}

View file

@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
do {
if (this->dev_ready(mtd))
return;
touch_softlockup_watchdog();
} while (time_before(jiffies, timeo));
}

View file

@ -176,6 +176,23 @@ extern void trap_init(void);
extern void update_process_times(int user);
extern void scheduler_tick(void);
#ifdef CONFIG_DETECT_SOFTLOCKUP
/* Soft-lockup detector entry points, available when the option is enabled. */
extern void softlockup_tick(struct pt_regs *regs);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
#else
/* Detector compiled out: empty inline stubs keep call sites free of #ifdefs. */
static inline void softlockup_tick(struct pt_regs *regs) { }
static inline void spawn_softlockup_task(void) { }
static inline void touch_softlockup_watchdog(void) { }
#endif
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
/* Is this address in the __sched functions? */

View file

@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
migration_init();
#endif
spawn_ksoftirqd();
spawn_softlockup_task();
}
static void run_init_process(char *init_filename)

View file

@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o

View file

@ -1059,6 +1059,7 @@ int swsusp_resume(void)
BUG_ON(!error);
restore_processor_state();
restore_highmem();
touch_softlockup_watchdog();
device_power_up();
local_irq_enable();
return error;

151
kernel/softlockup.c Normal file
View file

@ -0,0 +1,151 @@
/*
* Detect Soft Lockups
*
* started by Ingo Molnar, (C) 2005, Red Hat
*
* this code detects soft lockups: incidents where, on a given CPU,
* the kernel does not reschedule for 10 seconds or more.
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/module.h>
/* Held around the lockup report printed by softlockup_tick(). */
static DEFINE_SPINLOCK(print_lock);
/* Per-CPU jiffies value stored by the last touch_softlockup_watchdog(). */
static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
/*
 * Per-CPU copy of the 'timestamp' value that has already been reported;
 * softlockup_tick() stays silent while the two are equal.
 */
static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
/* Per-CPU watchdog thread, created and stopped by cpu_callback(). */
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
/* Set by softlock_panic(); softlockup_tick() reports nothing once set. */
static int did_panic = 0;
/*
 * Panic notifier callback: record that a panic has happened so that
 * softlockup_tick() stops reporting lockups from then on.
 *
 * None of the arguments are used; always returns NOTIFY_DONE.
 */
static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
	did_panic = 1;

	return NOTIFY_DONE;
}
/* Notifier block that routes panic notifications to softlock_panic(). */
static struct notifier_block panic_block = {
	.notifier_call	= softlock_panic,
};
/*
 * Refresh the soft-lockup timestamp of the CPU we are currently running
 * on, which postpones any lockup report for that CPU.
 */
void touch_softlockup_watchdog(void)
{
	int cpu = raw_smp_processor_id();

	per_cpu(timestamp, cpu) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
*/
void softlockup_tick(struct pt_regs *regs)
{
int this_cpu = smp_processor_id();
unsigned long timestamp = per_cpu(timestamp, this_cpu);
if (per_cpu(print_timestamp, this_cpu) == timestamp)
return;
/* Do not cause a second panic when there already was one */
if (did_panic)
return;
if (time_after(jiffies, timestamp + 10*HZ)) {
per_cpu(print_timestamp, this_cpu) = timestamp;
spin_lock(&print_lock);
printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
this_cpu);
show_regs(regs);
spin_unlock(&print_lock);
}
}
/*
 * Body of the watchdog thread: sleep for a second, touch the timestamp,
 * repeat until asked to stop.
 *
 * __bind_cpu carries the CPU number as a pointer-sized integer; it is
 * only used for the start-up message here.
 */
static int watchdog(void * __bind_cpu)
{
	struct sched_param param = { .sched_priority = 99 };
	int cpu = (long) __bind_cpu;

	printk("softlockup thread %d started up.\n", cpu);
	sched_setscheduler(current, SCHED_FIFO, &param);
	current->flags |= PF_NOFREEZE;

	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * Run briefly once per second - if this gets delayed for
	 * more than 10 seconds then the debug-printout triggers
	 * in softlockup_tick():
	 */
	for (;;) {
		if (kthread_should_stop())
			break;
		msleep_interruptible(1000);
		touch_softlockup_watchdog();
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
/*
 * Create/destroy watchdog threads as CPUs come and go.
 *
 * @nfb:    notifier block this callback was invoked through (unused)
 * @action: the CPU_* event being delivered
 * @hcpu:   number of the affected CPU, passed as a pointer-sized integer
 *
 * Returns NOTIFY_BAD when the watchdog thread cannot be created during
 * CPU_UP_PREPARE, NOTIFY_OK in every other case.
 */
static int __devinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action) {
	case CPU_UP_PREPARE:
		BUG_ON(per_cpu(watchdog_task, hotcpu));
		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
		if (IS_ERR(p)) {
			printk("watchdog for %i failed\n", hotcpu);
			return NOTIFY_BAD;
		}
		per_cpu(watchdog_task, hotcpu) = p;
		kthread_bind(p, hotcpu);
		break;
	case CPU_ONLINE:
		wake_up_process(per_cpu(watchdog_task, hotcpu));
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		/*
		 * The slot is still NULL if thread creation failed in
		 * CPU_UP_PREPARE above (or that step never ran for this
		 * CPU). There is then nothing to unbind or stop, and
		 * passing NULL on to kthread_bind()/kthread_stop() must
		 * be avoided.
		 */
		if (!per_cpu(watchdog_task, hotcpu))
			break;
		/* Unbind so it can run. Fall thru. */
		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
	case CPU_DEAD:
		/* Clear the slot before stopping the thread. */
		p = per_cpu(watchdog_task, hotcpu);
		per_cpu(watchdog_task, hotcpu) = NULL;
		kthread_stop(p);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}
/* CPU notifier block that delivers CPU up/down events to cpu_callback(). */
static struct notifier_block __devinitdata cpu_nfb = {
	.notifier_call	= cpu_callback,
};
__init void spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();
cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
notifier_chain_register(&panic_notifier_list, &panic_block);
}

View file

@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
{
jiffies_64++;
update_times();
softlockup_tick(regs);
}
#ifdef __ARCH_WANT_SYS_ALARM

View file

@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
13 => 8 KB
12 => 4 KB
config DETECT_SOFTLOCKUP
bool "Detect Soft Lockups"
depends on DEBUG_KERNEL
default y
help
Say Y here to enable the kernel to detect "soft lockups",
which are bugs that cause the kernel to loop in kernel
mode for more than 10 seconds, without giving other tasks a
chance to run.
When a soft-lockup is detected, the kernel will print the
current stack trace (which you should report), but the
system will stay locked up. This feature has negligible
overhead.
(Note that "hard lockups" are a separate type of bug that
can be detected via the NMI-watchdog, on platforms that
support it.)
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS