On Wed, 9 Jun 2004, Don Bowman wrote:

> I have a machine which is completely locking
> up solid every day or so. It's been doing this
> for a couple of months on current. It is running
> cvs current from ~2 weeks ago.
>
> This time, I tried shorting the NMI out, and I
> got this message to the serial console:
>
> kernel trap 19 with interrupts disabled
> NMI ... going to debugger
>
> ... but I still can't get into the debugger
> with the key sequence, and no additional
> output came out.
>
> Can I assume from the 'with interrupts disabled'
> that it means that all interrupts are locked off?
> or that 'sti' is set? It's an MP system, a dual
> Xeon (P4).

It means that the NMI was serviced by a CPU that has interrupts disabled at the CPU level. The message for this is a little spurious because an NMI by definition is supposed to be able to occur when interrupts are masked, but it can be useful to know when an unexpected or even an expected trap occurs with interrupts disabled. Here it tells us that the interrupt may really have needed to be non-maskable to break into some code that is looping with interrupts disabled.

If you didn't get a debugger prompt after the message, then the debugger is probably looping too. It tries to wait for all the other CPUs to stop, but this will hang if all the other CPUs are looping with interrupts disabled too. Try the following hack to get further.
%%% Index: db_interface.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v retrieving revision 1.81 diff -u -2 -r1.81 db_interface.c --- db_interface.c 3 Apr 2004 22:23:36 -0000 1.81 +++ db_interface.c 4 Apr 2004 05:37:38 -0000 @@ -35,4 +35,5 @@ #include <sys/reboot.h> #include <sys/cons.h> +#include <sys/ktr.h> #include <sys/pcpu.h> #include <sys/proc.h> @@ -41,4 +42,5 @@ #include <machine/cpu.h> #ifdef SMP +#include <machine/smp.h> #include <machine/smptests.h> /** CPUSTOP_ON_DDBBREAK */ #endif @@ -61,4 +63,33 @@ static jmp_buf db_global_jmpbuf; +#ifdef SMP +/* XXX this is cloned from stop_cpus() since that function can hang. */ +static int +attempt_to_stop_cpus(u_int map) +{ + int i; + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map); + + /* send the stop IPI to all CPUs in map */ + ipi_selected(map, IPI_STOP); + + i = 0; + while ((atomic_load_acq_int(&stopped_cpus) & map) != map) { + /* spin */ + i++; + if (i == 100000000) { + printf("timeout stopping cpus\n"); + break; + } + } + + return 1; +} +#endif /* SMP */ + /* * kdb_trap - field a TRACE or BPT trap @@ -69,4 +100,8 @@ u_int ef; volatile int ddb_mode = !(boothowto & RB_GDB); +#ifdef SMP + static u_int kdb_trap_lock = NOCPU; + static u_int output_lock; +#endif /* @@ -91,16 +126,48 @@ #ifdef SMP + if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 && + kdb_trap_lock != PCPU_GET(cpuid)) { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; + db_printf( + "concurrent ddb entry: type %d trap, code=%x cpu=%d\n", + type, code, PCPU_GET(cpuid)); + atomic_store_rel_int(&output_lock, 0); + if (type == T_BPTFLT) + regs->tf_eip--; + else { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; + db_printf( +"concurrent ddb entry on non-breakpoint: too hard to handle properly\n"); + atomic_store_rel_int(&output_lock, 0); + } + 
while (atomic_load_acq_int(&kdb_trap_lock) != NOCPU) + ; + write_eflags(ef); + return (1); + } +#endif + +#ifdef SMP #ifdef CPUSTOP_ON_DDBBREAK +#define VERBOSE_CPUSTOP_ON_DDBBREAK_NOT #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid), PCPU_GET(other_cpus)); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ /* We stop all CPUs except ourselves (obviously) */ - stop_cpus(PCPU_GET(other_cpus)); + attempt_to_stop_cpus(PCPU_GET(other_cpus)); #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf(" stopped.\n"); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ @@ -192,18 +259,29 @@ #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid), stopped_cpus); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ /* Restart all the CPUs we previously stopped */ if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n", PCPU_GET(other_cpus), stopped_cpus); + atomic_store_rel_int(&output_lock, 0); +#if 0 panic("stop_cpus() failed"); +#endif } restart_cpus(stopped_cpus); #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf(" restarted.\n"); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ @@ -211,4 +289,8 @@ #endif /* SMP */ +#ifdef SMP + atomic_store_rel_int(&kdb_trap_lock, NOCPU); +#endif + write_eflags(ef); %%% Bruce
Received on Thu Jun 10 2004 - 08:00:16 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:56 UTC