Re: 48GB NUMA-Q boots, with major IO-APIC hassles

William Lee Irwin III (wli@holomorphy.com)
Sat, 18 Jan 2003 17:50:13 -0800


On Wed, Jan 15, 2003 at 02:58:02AM -0800, William Lee Irwin III wrote:
>> (1) I've got 320 IRQ sources. This panic()'s in setup_IO_APIC_irqs().

On Sat, Jan 18, 2003 at 05:43:26PM -0800, William Lee Irwin III wrote:
> Where do you go for IO-APIC issues? Well, my MP tables say (Zwane
> pointed out fsmp@FreeBSD.org's mptable code):

Okay, and here is my latest attempt to deal with the issue (which is
dirty as sin code-wise, but nm that... I'm trying to debug this).

This doesn't actually work, I end up deadlocking presumably because
everything's waiting for an interrupt that's been dropped at some point.

diff -urpN mm1-2.5.59/arch/i386/kernel/io_apic.c irq-2.5.59-1/arch/i386/kernel/io_apic.c
--- mm1-2.5.59/arch/i386/kernel/io_apic.c 2003-01-17 01:04:43.000000000 -0800
+++ irq-2.5.59-1/arch/i386/kernel/io_apic.c 2003-01-18 15:15:01.000000000 -0800
@@ -647,6 +647,7 @@ static int __init find_irq_entry(int api

for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].mpc_irqtype == type &&
+ mp_bus_id_to_node[mp_irqs[i].mpc_srcbus] == apic/2 &&
(mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
mp_irqs[i].mpc_dstirq == pin)
@@ -696,8 +697,9 @@ int IO_APIC_get_PCI_irq_vector(int bus,
int lbus = mp_irqs[i].mpc_srcbus;

for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if ((mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+ mp_bus_id_to_node[mp_irqs[i].mpc_srcbus] == apic/2)
break;

if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
@@ -914,6 +916,12 @@ static int pin_2_irq(int idx, int apic,
int irq, i;
int bus = mp_irqs[idx].mpc_srcbus;

+#ifdef CONFIG_X86_NUMAQ
+ if (mp_bus_id_to_node[bus] != apic/2)
+ printk(KERN_ERR "bus %d on node %d, apic %d on node %d\n",
+ bus, mp_bus_id_to_node[bus], apic, apic/2);
+#endif
+
/*
* Debugging check, we are in big trouble if this message pops up!
*/
@@ -930,6 +938,10 @@ static int pin_2_irq(int idx, int apic,
break;
}
case MP_BUS_PCI: /* PCI pin */
+#ifdef CONFIG_X86_NUMAQ
+ irq = apic_pin_to_irq[apic][pin];
+ break;
+#else
{
/*
* PCI IRQs are mapped in order
@@ -940,6 +952,7 @@ static int pin_2_irq(int idx, int apic,
irq += pin;
break;
}
+#endif
default:
{
printk(KERN_ERR "unknown bus type %d.\n",bus);
@@ -984,6 +997,80 @@ static inline int IO_APIC_irq_trigger(in

int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };

+#ifdef CONFIG_X86_NUMAQ
+
+int vector_to_irq[MAX_NUMNODES][FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR + 1];
+int apic_pin_to_irq[MAX_IO_APICS][24];
+
+/*
+ * timer vectors must always go to 0
+ * vectors < FIRST_DEVICE_VECTOR are 1:1
+ * everything else goes through the table
+ */
+
+static void __init init_vector_to_irq(void)
+{
+ int n, v;
+ for (n = 0; n < MAX_NUMNODES; ++n) {
+ for (v = 1; v <= FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR; ++v)
+ vector_to_irq[n][v] = -1;
+ vector_to_irq[n][0] = 0;
+ }
+ for (n = 0; n < MAX_IO_APICS; ++n)
+ for (v = 0; v < 24; ++v)
+ apic_pin_to_irq[n][v] = -1;
+}
+
+int irq_of_vector(int vector)
+{
+ int irq;
+ if (vector < FIRST_DEVICE_VECTOR)
+ irq = vector;
+ else
+ irq = vector_to_irq[numa_node_id()][vector-FIRST_DEVICE_VECTOR];
+ return irq;
+}
+
+static void set_irq_of_vector(int apic, int vector, int irq)
+{
+ vector_to_irq[apic/2][vector-FIRST_DEVICE_VECTOR] = irq;
+}
+
+static void set_irq_of_pin(int apic, int pin, int irq)
+{
+ apic_pin_to_irq[apic][pin] = irq;
+}
+
+static int __init next_irq_vector(int vector)
+{
+ ++vector;
+ if (vector >= FIRST_SYSTEM_VECTOR)
+ vector = FIRST_DEVICE_VECTOR + 1;
+ else if (vector == SYSCALL_VECTOR)
+ ++vector;
+ return vector;
+}
+
+static int __init assign_irq_vector(int irq)
+{
+ static int current_vector = FIRST_DEVICE_VECTOR+1;
+ if (!irq)
+ return FIRST_DEVICE_VECTOR;
+ else if (!irq_vector[irq]) {
+ irq_vector[irq] = current_vector;
+ current_vector = next_irq_vector(current_vector);
+ }
+ return irq_vector[irq];
+}
+
+#else
+
+#define init_vector_to_irq() do {} while (0)
+#define set_irq_of_vector(a,v,i) do {} while (0)
+#define set_irq_of_pin(a,v,i) do {} while (0)
+
+int irq_of_vector(int vector) { return vector; }
+
static int __init assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
@@ -1005,6 +1092,7 @@ next:
IO_APIC_VECTOR(irq) = current_vector;
return current_vector;
}
+#endif

static struct hw_interrupt_type ioapic_level_irq_type;
static struct hw_interrupt_type ioapic_edge_irq_type;
@@ -1017,6 +1105,8 @@ void __init setup_IO_APIC_irqs(void)

printk(KERN_DEBUG "init IO_APIC IRQs\n");

+ init_vector_to_irq();
+
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {

@@ -1064,13 +1154,19 @@ void __init setup_IO_APIC_irqs(void)
if (IO_APIC_IRQ(irq)) {
vector = assign_irq_vector(irq);
entry.vector = vector;
+ set_irq_of_vector(apic, vector, irq);
+ set_irq_of_pin(apic, pin, irq);

if (IO_APIC_irq_trigger(irq))
irq_desc[irq].handler = &ioapic_level_irq_type;
else
irq_desc[irq].handler = &ioapic_edge_irq_type;

+#ifdef CONFIG_X86_NUMAQ
+ set_intr_gate(vector, interrupt[vector]);
+#else
set_intr_gate(vector, interrupt[irq]);
+#endif

if (!apic && (irq < 16))
disable_8259A_irq(irq);
@@ -1457,6 +1553,7 @@ static void __init setup_ioapic_ids_from
* Set the IOAPIC ID to the value stored in the MPC table.
*/
for (apic = 0; apic < nr_ioapics; apic++) {
+ unsigned long numaq_ioapic_id;

/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
@@ -1465,6 +1562,7 @@ static void __init setup_ioapic_ids_from

old_id = mp_ioapics[apic].mpc_apicid;

+#ifndef CONFIG_X86_NUMAQ
if (mp_ioapics[apic].mpc_apicid >= APIC_BROADCAST_ID) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
apic, mp_ioapics[apic].mpc_apicid);
@@ -1495,6 +1593,7 @@ static void __init setup_ioapic_ids_from
printk("Setting %d in the phys_id_present_map\n", mp_ioapics[apic].mpc_apicid);
phys_id_present_map |= 1 << mp_ioapics[apic].mpc_apicid;
}
+#endif /* CONFIG_X86_NUMAQ */


/*
@@ -1507,14 +1606,19 @@ static void __init setup_ioapic_ids_from
mp_irqs[i].mpc_dstapic
= mp_ioapics[apic].mpc_apicid;

+#ifdef CONFIG_X86_NUMAQ
+ numaq_ioapic_id = (mp_ioapics[apic].mpc_apicid & 1) ? 13 : 14;
+#else
+ numaq_ioapic_id = mp_ioapics[apic].mpc_apicid;
+#endif /* CONFIG_X86_NUMAQ */
+
/*
* Read the right value from the MPC table and
* write it into the ID register.
*/
- printk(KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ printk(KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", (int)numaq_ioapic_id);

- reg_00.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.ID = numaq_ioapic_id;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, *(int *)&reg_00);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1525,7 +1629,7 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
*(int *)&reg_00 = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.ID != numaq_ioapic_id)
panic("could not set ID!\n");
else
printk(" ok.\n");
@@ -1741,7 +1845,7 @@ static struct hw_interrupt_type ioapic_l
set_ioapic_affinity,
};

-static inline void init_IO_APIC_traps(void)
+static void init_IO_APIC_traps(void)
{
int irq;

@@ -1765,9 +1869,12 @@ static inline void init_IO_APIC_traps(vo
*/
if (irq < 16)
make_8259A_irq(irq);
- else
+ else {
/* Strange. Oh, well.. */
irq_desc[irq].handler = &no_irq_type;
+ printk("init_IO_APIC_traps():"
+ "unhandled irq %d\n", irq);
+ }
}
}
}
@@ -1915,7 +2022,11 @@ static inline void check_timer(void)
*/
disable_8259A_irq(0);
vector = assign_irq_vector(0);
+#ifdef CONFIG_X86_NUMAQ
+ set_intr_gate(vector, interrupt[vector]);
+#else
set_intr_gate(vector, interrupt[0]);
+#endif

/*
* Subtle, code in do_timer_interrupt() expects an AEOI
diff -urpN mm1-2.5.59/arch/i386/kernel/irq.c irq-2.5.59-1/arch/i386/kernel/irq.c
--- mm1-2.5.59/arch/i386/kernel/irq.c 2003-01-17 01:04:43.000000000 -0800
+++ irq-2.5.59-1/arch/i386/kernel/irq.c 2003-01-18 11:11:40.000000000 -0800
@@ -69,6 +69,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline
{ [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};

static void register_irq_proc (unsigned int irq);
+int irq_of_vector(int);

/*
* Special irq handlers.
@@ -92,6 +93,7 @@ static void ack_none(unsigned int irq)
*/
#if CONFIG_X86
printk("unexpected IRQ trap at vector %02x\n", irq);
+ dump_stack();
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Currently unexpected vectors happen only on SMP and APIC.
@@ -323,12 +325,19 @@ asmlinkage unsigned int do_IRQ(struct pt
* 0 return value means that this irq is already being
* handled by some other CPU. (or is disabled)
*/
- int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */
+ /* high bits used in ret_from_ code */
+ int irq = irq_of_vector(regs.orig_eax & 0xff);
int cpu = smp_processor_id();
irq_desc_t *desc = irq_desc + irq;
struct irqaction * action;
unsigned int status;

+ if (irq < 0) {
+ printk("bad vector %ld, irq %d\n", regs.orig_eax & 0xff, irq);
+ dump_stack();
+ return 1;
+ }
+
irq_enter();

#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff -urpN mm1-2.5.59/arch/i386/pci/numa.c irq-2.5.59-1/arch/i386/pci/numa.c
--- mm1-2.5.59/arch/i386/pci/numa.c 2003-01-16 18:21:44.000000000 -0800
+++ irq-2.5.59-1/arch/i386/pci/numa.c 2003-01-18 11:11:40.000000000 -0800
@@ -117,6 +117,14 @@ struct pci_fixup pcibios_fixups[] = {
{ PCI_FIXUP_HEADER, PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx },
};

+void pci_fixup_child(struct pci_bus *parent, struct pci_bus *child, unsigned int buses)
+{
+ int quad = BUS2QUAD(parent->number);
+ child->primary = QUADLOCAL2BUS(quad, buses & 0xFF);
+ child->secondary = QUADLOCAL2BUS(quad, (buses >> 8) & 0xFF);
+ child->subordinate = QUADLOCAL2BUS(quad, (buses >> 16) & 0xFF);
+}
+
static int __init pci_numa_init(void)
{
int quad;
@@ -127,7 +135,7 @@ static int __init pci_numa_init(void)
return 0;

pci_root_bus = pcibios_scan_root(0);
- if (numnodes > 1) {
+ if (0 && numnodes > 1) {
for (quad = 1; quad < numnodes; ++quad) {
printk("Scanning PCI bus %d for quad %d\n",
QUADLOCAL2BUS(quad,0), quad);
diff -urpN mm1-2.5.59/drivers/pci/probe.c irq-2.5.59-1/drivers/pci/probe.c
--- mm1-2.5.59/drivers/pci/probe.c 2003-01-16 18:22:24.000000000 -0800
+++ irq-2.5.59-1/drivers/pci/probe.c 2003-01-18 11:11:40.000000000 -0800
@@ -244,6 +244,17 @@ struct pci_bus * __devinit pci_add_new_b
return child;
}

+#ifdef CONFIG_X86_NUMAQ
+void pci_fixup_child(struct pci_bus *, struct pci_bus *, int);
+#else
+void pci_fixup_child(struct pci_bus *parent, struct pci_bus *child, unsigned int buses)
+{
+ child->primary = buses & 0xFF;
+ child->secondary = (buses >> 8) & 0xFF;
+ child->subordinate = (buses >> 16) & 0xFF;
+}
+#endif
+
/*
* If it's a bridge, configure it and scan the bus behind it.
* For CardBus bridges, we don't scan behind as the devices will
@@ -271,9 +282,7 @@ int __devinit pci_scan_bridge(struct pci
if (pass)
return max;
child = pci_add_new_bus(bus, dev, 0);
- child->primary = buses & 0xFF;
- child->secondary = (buses >> 8) & 0xFF;
- child->subordinate = (buses >> 16) & 0xFF;
+ pci_fixup_child(bus, child, buses);
child->number = child->secondary;
cmax = pci_do_scan_bus(child);
if (cmax > max) max = cmax;
diff -urpN mm1-2.5.59/include/asm-i386/hardirq.h irq-2.5.59-1/include/asm-i386/hardirq.h
--- mm1-2.5.59/include/asm-i386/hardirq.h 2003-01-16 18:22:26.000000000 -0800
+++ irq-2.5.59-1/include/asm-i386/hardirq.h 2003-01-18 11:11:40.000000000 -0800
@@ -34,7 +34,11 @@ typedef struct {

#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
+#ifdef CONFIG_X86_NUMAQ
+#define HARDIRQ_BITS 9
+#else
#define HARDIRQ_BITS 8
+#endif

#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
diff -urpN mm1-2.5.59/include/asm-i386/mach-numaq/irq_vectors.h irq-2.5.59-1/include/asm-i386/mach-numaq/irq_vectors.h
--- mm1-2.5.59/include/asm-i386/mach-numaq/irq_vectors.h 1969-12-31 16:00:00.000000000 -0800
+++ irq-2.5.59-1/include/asm-i386/mach-numaq/irq_vectors.h 2003-01-18 11:11:40.000000000 -0800
@@ -0,0 +1,81 @@
+/*
+ * This file should contain #defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ * FIRST_EXTERNAL_VECTOR:
+ * The first free place for external interrupts
+ *
+ * SYSCALL_VECTOR:
+ * The IRQ vector a syscall makes the user to kernel transition
+ * under.
+ *
+ * TIMER_IRQ:
+ * The IRQ number the timer interrupt comes in at.
+ *
+ * NR_IRQS:
+ * The total number of interrupt vectors (including all the
+ * architecture specific interrupts) needed.
+ *
+ */
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+#define SYSCALL_VECTOR 0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define INVALIDATE_TLB_VECTOR 0xfd
+#define RESCHEDULE_VECTOR 0xfc
+#define CALL_FUNCTION_VECTOR 0xfb
+
+#define THERMAL_APIC_VECTOR 0xf0
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR 0x31
+#define FIRST_SYSTEM_VECTOR 0xef
+
+#define TIMER_IRQ 0
+
+/*
+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
+ * Right now the APIC is mostly only used for SMP.
+ * 256 vectors is an architectural limit. (we can have
+ * more than 256 devices theoretically, but they will
+ * have to use shared interrupts)
+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
+ * the usable vector space is 0x20-0xff (224 vectors)
+ */
+#define NR_IRQS 512
+
+#endif /* _ASM_IRQ_VECTORS_H */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/