Re: lockups

From: Bruce Evans <bde_at_zeta.org.au>
Date: Sat, 15 May 2004 08:32:04 +1000 (EST)
On Fri, 14 May 2004, John Baldwin wrote:

> On Thursday 13 May 2004 11:35 pm, Jason King wrote:
> > I'm having some problems with -CURRENT locking up, I'm hoping maybe
> > someone will have some suggestions.
> >
> > Symptoms:
> >
> > (This is both on 5.2.1-CURRENT as well as booting off the 5.2.1-RELEASE
> > cdrom)
> >
> > During normal boot, kernel freezes before it even starts init.
> > Ctrl-Alt-Delete does not work, power button must be used to reboot.
> >
> > boot -v reveals 'Interrupt storm on "dc0"; throttling interrupt
> > source'.  Thinking it might be a bad network card (though it works fine
> > in XP and worked fine when I had 5.1-RELEASE installed), I removed the
> > card, and I got the same error, just on a different device (pcm0).
>
> Ok, this would explain the slow boot w/o ACPI as well if interrupts are not
> routed correctly.  Does the machine boot ok if you do 'set
> hint.apic.0.disabled=1' from the boot loader?

Here are my current quick fixes for interrupt storm handling on another
nForce2 system (A7N8X-E) which has interrupt storms on all
interrupts >= 16.  Interrupt storms are often not detected because the next
interrupt doesn't happen until a little after the loop exits, and when
they are detected you wish that they weren't because non-storming
interrupts (with the same interrupt number as storming ones) are
throttled to a very low rate too.  The DELAY() in kern_intr.c may be
unnecessary now that there is a DELAY() in intr_machdep.c.  A
fully-storming interrupt is supposed to be reduced to polling after
every clock tick, but without the latter DELAY() only every second fully-storming
interrupt was detected as storming.  Without either DELAY(), only every Nth
(N large) fully-storming interrupt was detected as storming (probably
due to accidental delays for other interrupt handling).

%%%
Index: kern/kern_intr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_intr.c,v
retrieving revision 1.108
diff -u -2 -r1.108 kern_intr.c
--- kern/kern_intr.c	17 Apr 2004 02:46:05 -0000	1.108
+++ kern/kern_intr.c	24 Apr 2004 14:57:39 -0000
@@ -39,4 +39,5 @@
 #include <sys/kthread.h>
 #include <sys/ktr.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -495,5 +496,5 @@
 	struct thread *td;
 	struct proc *p;
-	int count, warned;
+	int count, warming, warned;

 	td = curthread;
@@ -502,5 +503,5 @@
 	KASSERT(ithd->it_td == td && td->td_ithd == ithd,
 	    ("%s: ithread and proc linkage out of sync", __func__));
-	count = 0;
+	warming = 10 * intr_storm_threshold;
 	warned = 0;

@@ -524,4 +525,5 @@
 		CTR4(KTR_INTR, "%s: pid %d: (%s) need=%d", __func__,
 		     p->p_pid, p->p_comm, ithd->it_need);
+		count = 0;
 		while (ithd->it_need) {
 			/*
@@ -532,23 +534,4 @@
 			 */
 			atomic_store_rel_int(&ithd->it_need, 0);
-
-			/*
-			 * If we detect an interrupt storm, pause with
-			 * the source masked for 1/10th of a second.
-			 */
-			if (intr_storm_threshold != 0 && count >=
-			    intr_storm_threshold) {
-				if (!warned) {
-					printf(
-	"Interrupt storm detected on \"%s\"; throttling interrupt source\n",
-					    p->p_comm);
-					warned = 1;
-				}
-				tsleep(&count, td->td_priority, "istorm",
-				    hz / 10);
-				count = 0;
-			} else
-				count++;
-
 restart:
 			TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) {
@@ -576,6 +559,53 @@
 					mtx_unlock(&Giant);
 			}
-			if (ithd->it_enable != NULL)
+			if (ithd->it_enable != NULL) {
 				ithd->it_enable(ithd->it_vector);
+
+				/*
+				 * Storm detection needs a delay here
+				 * to see slightly delayed interrupts
+				 * on some machines, but we don't
+				 * want to always delay, so only delay
+				 * while warming up.
+				 */
+				if (warming != 0) {
+					DELAY(1);
+					--warming;
+				}
+			}
+
+			/*
+			 * If we detect an interrupt storm, sleep until
+			 * the next hardclock tick.  We sleep at the
+			 * end of the loop instead of at the beginning
+			 * to ensure that we see slightly delayed
+			 * interrupts.
+			 */
+			if (count >= intr_storm_threshold) {
+				if (!warned) {
+					printf(
+	"Interrupt storm detected on \"%s\"; throttling interrupt source\n",
+					    p->p_comm);
+					warned = 1;
+				}
+				if (cold)
+					Debugger("istorm botch");
+				tsleep(&count, td->td_priority, "istorm", 1);
+
+				/*
+				 * Fudge the count to re-throttle if the
+				 * interrupt is still active.  Our storm
+				 * detection is too primitive to detect
+				 * whether the storm has gone away
+				 * reliably, even if we were to waste a
+				 * lot of time spinning for the next
+				 * intr_storm_threshold interrupts, so
+				 * we assume that the storm hasn't gone
+				 * away unless the interrupt repeats
+				 * less often than the hardclock interrupt.
+				 */
+				count = INT_MAX - 1;
+			}
+			count++;
 		}
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
@@ -590,5 +620,4 @@
 		if (!ithd->it_need) {
 			TD_SET_IWAIT(td);
-			count = 0;
 			CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid);
 			mi_switch(SW_VOL);
Index: i386/i386/intr_machdep.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/intr_machdep.c,v
retrieving revision 1.5
diff -u -2 -r1.5 intr_machdep.c
--- i386/i386/intr_machdep.c	4 May 2004 21:02:56 -0000	1.5
+++ i386/i386/intr_machdep.c	5 May 2004 19:55:30 -0000
@@ -38,4 +38,5 @@
  */

+#include "opt_apic.h"
 #include "opt_ddb.h"

@@ -211,4 +212,7 @@
 		 */
 		isrc->is_pic->pic_disable_source(isrc);
+#ifdef DEV_APIC
+		DELAY(1);		/* XXX */
+#endif
 		isrc->is_pic->pic_eoi_source(isrc);
 		if (ih == NULL)
%%%
Received on Fri May 14 2004 - 13:32:09 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:54 UTC