Re: 2.5.68-mmX: Drowning in irq 7: nobody cared!

Andrew Morton (akpm@digeo.com)
Tue, 6 May 2003 08:17:16 -0700


Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>
> With APIC at least, it doesn't surprise me in the least. The IRQ hack seems
> extremely racy.

Good point. How about we do something like "if half of the past 1000
interrupts weren't handled then try to kill the IRQ"?

arch/i386/kernel/irq.c | 55 +++++++++++++++++++++++++++++++++++++++----------
include/linux/irq.h | 2 +
2 files changed, 46 insertions(+), 11 deletions(-)

diff -puN arch/i386/kernel/irq.c~irq-check-rate-limit arch/i386/kernel/irq.c
--- 25/arch/i386/kernel/irq.c~irq-check-rate-limit 2003-05-06 07:54:17.000000000 -0700
+++ 25-akpm/arch/i386/kernel/irq.c 2003-05-06 08:16:15.000000000 -0700
@@ -66,8 +66,12 @@
/*
* Controller mappings for all interrupt sources:
*/
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
- { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+ [0 ... NR_IRQS-1] = {
+ .handler = &no_irq_type,
+ .lock = SPIN_LOCK_UNLOCKED
+ }
+};

static void register_irq_proc (unsigned int irq);

@@ -209,7 +213,6 @@ int handle_IRQ_event(unsigned int irq,
{
int status = 1; /* Force the "do bottom halves" bit */
int retval = 0;
- struct irqaction *first_action = action;

if (!(action->flags & SA_INTERRUPT))
local_irq_enable();
@@ -222,19 +225,43 @@ int handle_IRQ_event(unsigned int irq,
if (status & SA_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
local_irq_disable();
- if (retval != 1) {
+ return status;
+}
+
+/*
+ * If 500 of the previous 1000 interrupts have not been handled then assume
+ * that the IRQ is stuck in some manner. Drop a diagnostic and try to turn the
+ * IRQ off.
+ *
+ * Called under desc->lock
+ */
+static void note_interrupt(irq_desc_t *desc, int irq, int status)
+{
+ if (status != IRQ_HANDLED)
+ desc->irqs_unhandled++;
+ desc->irq_count++;
+ if (desc->irq_count < 1000)
+ return;
+
+ desc->irq_count = 0;
+ if (desc->irqs_unhandled > 500) {
+ /*
+ * The interrupt is stuck
+ */
static int count = 100;
+ struct irqaction *action;
+
if (count) {
count--;
- if (retval) {
+ if (status) {
printk("irq event %d: bogus retval mask %x\n",
- irq, retval);
+ irq, status);
} else {
printk("irq %d: nobody cared!\n", irq);
}
dump_stack();
printk("handlers:\n");
- action = first_action;
+ action = desc->action;
do {
printk("[<%p>]", action->handler);
print_symbol(" (%s)",
@@ -243,9 +270,13 @@ int handle_IRQ_event(unsigned int irq,
action = action->next;
} while (action);
}
+ /*
+ * Now kill the IRQ
+ */
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
}
-
- return status;
+ desc->irqs_unhandled = 0;
}

/*
@@ -418,10 +449,12 @@ asmlinkage unsigned int do_IRQ(struct pt
* SMP environment.
*/
for (;;) {
+ int status;
+
spin_unlock(&desc->lock);
- handle_IRQ_event(irq, &regs, action);
+ status = handle_IRQ_event(irq, &regs, action);
spin_lock(&desc->lock);
-
+ note_interrupt(desc, irq, status);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
diff -puN include/linux/irq.h~irq-check-rate-limit include/linux/irq.h
--- 25/include/linux/irq.h~irq-check-rate-limit 2003-05-06 07:56:03.000000000 -0700
+++ 25-akpm/include/linux/irq.h 2003-05-06 08:05:17.000000000 -0700
@@ -61,6 +61,8 @@ typedef struct {
hw_irq_controller *handler;
struct irqaction *action; /* IRQ action list */
unsigned int depth; /* nested irq disables */
+ unsigned int irq_count; /* For detecting broken interrupts */
+ unsigned int irqs_unhandled;
spinlock_t lock;
} ____cacheline_aligned irq_desc_t;

_

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/