Re: kernel trap 19 with interrupts disabled

From: Bruce Evans <bde_at_zeta.org.au>
Date: Thu, 10 Jun 2004 19:59:46 +1000 (EST)
On Wed, 9 Jun 2004, Don Bowman wrote:

> I have a machine which is completely locking
> up solid every day or so. Its been doing this
> for a couple of months on current. It is running
> cvs current from ~2weeks ago.
>
> This time, i tried shorting the NMI out, and I
> got this message to the serial console:
>
> kernel trap 19 with interrupts disabled
> NMI ... going to debugger
>
> ... but I still can't get into the debugger
> with the key sequence, and no additional
> output came out.
>
> Can I assume from the 'with interrupts disabled'
> that it means that all interrupts are locked off?
> or that 'sti' is set? Its a MP system, a dual
> xeon (P4).

It means that the NMI was serviced by a CPU that has interrupts disabled
at the CPU level.  The message for this is a little spurious because NMI
by definition is supposed to be able to occur when interrupts are masked,
but it can be useful to know when an unexpected or even an expected trap
occurs with interrupts disabled.  Here it tells us that the interrupt
may really have needed to be non-maskable to break into some code that is
looping with interrupts disabled.

If you didn't get a debugger prompt after the message, then the debugger
is probably looping too.  It tries to wait for all the other CPUs to stop,
but this will hang if all the other CPUs are looping with interrupts
disabled too.  Try the following hack to get further.

%%%
Index: db_interface.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v
retrieving revision 1.81
diff -u -2 -r1.81 db_interface.c
--- db_interface.c	3 Apr 2004 22:23:36 -0000	1.81
+++ db_interface.c	4 Apr 2004 05:37:38 -0000
@@ -35,4 +35,5 @@
 #include <sys/reboot.h>
 #include <sys/cons.h>
+#include <sys/ktr.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
@@ -41,4 +42,5 @@
 #include <machine/cpu.h>
 #ifdef SMP
+#include <machine/smp.h>
 #include <machine/smptests.h>	/** CPUSTOP_ON_DDBBREAK */
 #endif
@@ -61,4 +63,33 @@
 static jmp_buf	db_global_jmpbuf;

+#ifdef SMP
+/* XXX this is cloned from stop_cpus() since that function can hang. */
+static int
+attempt_to_stop_cpus(u_int map)
+{
+	int i;
+
+	if (!smp_started)
+		return 0;
+
+	CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map);
+
+	/* send the stop IPI to all CPUs in map */
+	ipi_selected(map, IPI_STOP);
+
+	i = 0;
+	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+		/* spin */
+		i++;
+		if (i == 100000000) {
+			printf("timeout stopping cpus\n");
+			break;
+		}
+	}
+
+	return 1;
+}
+#endif /* SMP */
+
 /*
  *  kdb_trap - field a TRACE or BPT trap
@@ -69,4 +100,8 @@
 	u_int ef;
 	volatile int ddb_mode = !(boothowto & RB_GDB);
+#ifdef SMP
+	static u_int kdb_trap_lock = NOCPU;
+	static u_int output_lock;
+#endif

 	/*
@@ -91,16 +126,48 @@

 #ifdef SMP
+	if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 &&
+	    kdb_trap_lock != PCPU_GET(cpuid)) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
+		db_printf(
+		    "concurrent ddb entry: type %d trap, code=%x cpu=%d\n",
+		    type, code, PCPU_GET(cpuid));
+		atomic_store_rel_int(&output_lock, 0);
+		if (type == T_BPTFLT)
+			regs->tf_eip--;
+		else {
+			while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+				;
+			db_printf(
+"concurrent ddb entry on non-breakpoint: too hard to handle properly\n");
+			atomic_store_rel_int(&output_lock, 0);
+		}
+		while (atomic_load_acq_int(&kdb_trap_lock) != NOCPU)
+			;
+		write_eflags(ef);
+		return (1);
+	}
+#endif
+
+#ifdef SMP
 #ifdef CPUSTOP_ON_DDBBREAK
+#define VERBOSE_CPUSTOP_ON_DDBBREAK_NOT

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    PCPU_GET(other_cpus));
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* We stop all CPUs except ourselves (obviously) */
-	stop_cpus(PCPU_GET(other_cpus));
+	attempt_to_stop_cpus(PCPU_GET(other_cpus));

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" stopped.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

@@ -192,18 +259,29 @@

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    stopped_cpus);
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* Restart all the CPUs we previously stopped */
 	if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
 		db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n",
 			  PCPU_GET(other_cpus), stopped_cpus);
+		atomic_store_rel_int(&output_lock, 0);
+#if 0
 		panic("stop_cpus() failed");
+#endif
 	}
 	restart_cpus(stopped_cpus);

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" restarted.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

@@ -211,4 +289,8 @@
 #endif /* SMP */

+#ifdef SMP
+	atomic_store_rel_int(&kdb_trap_lock, NOCPU);
+#endif
+
 	write_eflags(ef);

%%%

Bruce
Received on Thu Jun 10 2004 - 08:00:16 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:56 UTC