Re: panic on one cpu leaves others running...

From: Bruce Evans <bde_at_zeta.org.au>
Date: Fri, 9 Apr 2004 00:30:58 +1000 (EST)
On Thu, 8 Apr 2004, Robert Watson wrote:

> panic: m 0 so->so_rcv.sb_cc 17
> at line 860 in file ../../../kern/uipc_socket.c
> cpuid = 1;
> Debugger("panic")
> Stopped at      Debugger+0x46:  xchgl   %ebx,in_Debugger.0
> db> trace
> Debugger(c07c3990) at Debugger+0x46
> __panic(c07c98f1,35c,c07c997d,0,11) at __panic+0x13d
> soreceive(c6664618,e9891c0c,e9891c38,0,e9891c10) at soreceive+0x20c
> recvit(c6561e70,3,e9891cc0,0,bfbfe410) at recvit+0x1a2
> recvmsg(c6561e70,e9891d14,3,4,296) at recvmsg+0x9a
> syscall(808002f,bfbf002f,bfbf002f,bfbfe44c,8079a70) at syscall+0x217
> Xint0x80_syscall() at Xint0x80_syscall+0x1d
> --- syscall (27, FreeBSD ELF32, recvmsg), eip = 0x282afff7, esp =
> 0xbfbfe3fc, ebp = 0xbfbfe458 ---
> db> Apr  8 04:09:29  sm-mta[3550]: i3831Ija003419: SYSERR(root): hash map
> "Alias0": missing map file /etc/mail/aliases.db: No such file or directory
> Apr  8 04:09:29  sm-mta[3550]: i3831Ija003419: SYSERR(root): cannot
> flock(/etc/mail/aliases, fd=5, type=1, omode=40000, euid=0): Operation not
> supported
>
> Funky, eh?  I thought we used to have code to IPI the other CPUs and halt
> them until the CPU in ddb was out again.  I guess I misremember, or that
> code is broken...

ddb stops the other CPUs (at least on i386's, unless you have edited
smptests.h to comment out the option CPUSTOP_ON_DDBBREAK which should
be non-optional (always enabled)), but plain panic() doesn't stop them
immediately, so much may happen on other CPUs if ddb is not called
from panic() or if ddb has problems stopping the CPUs.  ddb does have
problems stopping the CPUs, but I don't see how it can reach the db>
prompt before stopping them.  The main problem is that stopping all
the other CPUs may be impossible because one of them is looping with
IPIs disabled, perhaps because it is trying to enter ddb (and stop
other CPUs) too.  All CPUs entering ddb should hang in this case.

Half-baked fixes:

%%%
Index: db_interface.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v
retrieving revision 1.81
diff -u -2 -r1.81 db_interface.c
--- db_interface.c	3 Apr 2004 22:23:36 -0000	1.81
+++ db_interface.c	4 Apr 2004 05:37:38 -0000
@@ -35,4 +35,5 @@
 #include <sys/reboot.h>
 #include <sys/cons.h>
+#include <sys/ktr.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
@@ -41,4 +42,5 @@
 #include <machine/cpu.h>
 #ifdef SMP
+#include <machine/smp.h>
 #include <machine/smptests.h>	/** CPUSTOP_ON_DDBBREAK */
 #endif
@@ -61,4 +63,33 @@
 static jmp_buf	db_global_jmpbuf;

+#ifdef SMP
+/* XXX this is cloned from stop_cpus() since that function can hang. */
+static int
+attempt_to_stop_cpus(u_int map)
+{
+	int i;
+
+	if (!smp_started)
+		return 0;
+
+	CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map);
+
+	/* send the stop IPI to all CPUs in map */
+	ipi_selected(map, IPI_STOP);
+
+	i = 0;
+	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+		/* spin */
+		i++;
+		if (i == 100000000) {
+			printf("timeout stopping cpus\n");
+			break;
+		}
+	}
+
+	return 1;
+}
+#endif /* SMP */
+
 /*
  *  kdb_trap - field a TRACE or BPT trap
@@ -69,4 +100,8 @@
 	u_int ef;
 	volatile int ddb_mode = !(boothowto & RB_GDB);
+#ifdef SMP
+	static u_int kdb_trap_lock = NOCPU;
+	static u_int output_lock;
+#endif

 	/*
@@ -91,16 +126,48 @@

 #ifdef SMP
+	if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 &&
+	    kdb_trap_lock != PCPU_GET(cpuid)) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
+		db_printf(
+		    "concurrent ddb entry: type %d trap, code=%x cpu=%d\n",
+		    type, code, PCPU_GET(cpuid));
+		atomic_store_rel_int(&output_lock, 0);
+		if (type == T_BPTFLT)
+			regs->tf_eip--;
+		else {
+			while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+				;
+			db_printf(
+"concurrent ddb entry on non-breakpoint: too hard to handle properly\n");
+			atomic_store_rel_int(&output_lock, 0);
+		}
+		while (atomic_load_acq_int(&kdb_trap_lock) != NOCPU)
+			;
+		write_eflags(ef);
+		return (1);
+	}
+#endif
+
+#ifdef SMP
 #ifdef CPUSTOP_ON_DDBBREAK
+#define VERBOSE_CPUSTOP_ON_DDBBREAK_NOT

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    PCPU_GET(other_cpus));
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* We stop all CPUs except ourselves (obviously) */
-	stop_cpus(PCPU_GET(other_cpus));
+	attempt_to_stop_cpus(PCPU_GET(other_cpus));

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" stopped.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

@@ -192,22 +259,37 @@

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    stopped_cpus);
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* Restart all the CPUs we previously stopped */
 	if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
 		db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n",
 			  PCPU_GET(other_cpus), stopped_cpus);
+		atomic_store_rel_int(&output_lock, 0);
+#if 0
 		panic("stop_cpus() failed");
+#endif
 	}
 	restart_cpus(stopped_cpus);

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" restarted.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 #endif /* CPUSTOP_ON_DDBBREAK */
 #endif /* SMP */
+
+#ifdef SMP
+	atomic_store_rel_int(&kdb_trap_lock, NOCPU);
+#endif

 	write_eflags(ef);
%%%

This is supposed to wait for the other CPUs to either stop or enter
ddb.  They shouldn't loop with interrupts disabled anywhere else.  The
output_lock stuff here is especially half-baked.  The
VERBOSE_CPUSTOP_ON_DDBBREAK option should be non-optional (always
disabled), but I needed something to debug concurrent entry and
interleaved output is hard to read.

Bruce
Received on Thu Apr 08 2004 - 05:31:04 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:50 UTC