Re: Serial debug broken in recent -CURRENT?

From: Bruce Evans <bde_at_zeta.org.au> Date: Wed, 8 Oct 2003 02:08:55 +1000 (EST) · This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:24 UTC

On Tue, 30 Sep 2003, Sam Leffler wrote:

> It reliably locks up for me when you break into a running system; set a
> breakpoint; and then continue.  Machine is UP+HTT.  Haven't tried other
> machines.

This seems to be because rev.1.75 of db_interface.c disturbed some much
larger bugs related to the ones that it fixed.  It takes miracles for
entering ddb to even sort of work in the SMP case.  If multiple CPUs
call kdb_trap() concurrently, e.g., by all hitting the same breakpoint,
then after 1.75 they first race to stop each other.  Before 1.75, they
raced to clobber each others registers before this.  The race to stop
each other cannot be won since all the CPUs have interrupts disabled
so they cannot respond to IPIs.  It doesn't help that stop_cpus is silent
about this.  It spins silently forever if a CPU can't be stopped, unless
DIAGNOSTIC is configured in which case it gives the plain broken behaviour
of warning and returning after not waiting for long enough.  But things
somehow worked better before 1.75.  I don't know exactly why.  1.75 only
changes the timing a little, and I would have thought that it reduced the
races by giving the other CPUs less time to enter ddb.  My tests mainly
used a breakpoint at ithread_schedule which is sure to be hit by multiple
CPUs quite often, but there wasn't enough interrupt activity for concurrent
entry to be the usual case.  Debugging printfs affected the races a lot --
turning on VERBOSE_STOP_ON_CPU_BREAK mostly avoided the problem, but with
a syscons console it sometimes caused fatal traps in bcopy().

If one of multiple CPUs in kdb_trap() somehow stops the others, then the
others face different problems when they restart.  They can't just return
because debugger traps are not restartable (by just returning).  They can't
just proceed because the first CPU may changed the state in such a way as
to make proceeding in the normal way not work (e.g., it may have deleted
a breakpoint).

These problems are not correctly or completely fixed in:

%%%
Index: db_interface.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v
retrieving revision 1.75
diff -u -2 -r1.75 db_interface.c
--- db_interface.c	7 Sep 2003 13:43:01 -0000	1.75
+++ db_interface.c	7 Oct 2003 14:11:35 -0000
_at__at_ -35,4 +35,5 _at__at_
 #include <sys/reboot.h>
 #include <sys/cons.h>
+#include <sys/ktr.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
_at__at_ -41,4 +42,5 _at__at_
 #include <machine/cpu.h>
 #ifdef SMP
+#include <machine/smp.h>
 #include <machine/smptests.h>	/** CPUSTOP_ON_DDBBREAK */
 #endif
_at__at_ -73,4 +75,31 _at__at_
 }

+/* XXX this is cloned from stop_cpus() since that function can hang. */
+static int
+attempt_to_stop_cpus(u_int map)
+{
+	int i;
+
+	if (!smp_started)
+		return 0;
+
+	CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map);
+
+	/* send the stop IPI to all CPUs in map */
+	ipi_selected(map, IPI_STOP);
+
+	i = 0;
+	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+		/* spin */
+		i++;
+		if (i == 100000000) {
+			printf("timeout stopping cpus\n");
+			break;
+		}
+	}
+
+	return 1;
+}
+
 /*
  *  kdb_trap - field a TRACE or BPT trap
_at__at_ -81,4 +110,6 _at__at_
 	u_int ef;
 	volatile int ddb_mode = !(boothowto & RB_GDB);
+	static u_int kdb_trap_lock = NOCPU;
+	static u_int output_lock;

 	/*
_at__at_ -103,16 +134,48 _at__at_

 #ifdef SMP
+	if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 &&
+	    kdb_trap_lock != PCPU_GET(cpuid)) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
+		db_printf(
+		    "concurrent ddb entry: type %d trap, code=%x cpu=%d\n",
+		    type, code, PCPU_GET(cpuid));
+		atomic_store_rel_int(&output_lock, 0);
+		if (type == T_BPTFLT)
+			regs->tf_eip--;
+		else {
+			while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+				;
+			db_printf(
+"concurrent ddb entry on non-breakpoint: too hard to handle properly\n");
+			atomic_store_rel_int(&output_lock, 0);
+		}
+		while (atomic_load_acq_int(&kdb_trap_lock) != NOCPU)
+			;
+		write_eflags(ef);
+		return (1);
+	}
+#endif
+
+#ifdef SMP
 #ifdef CPUSTOP_ON_DDBBREAK

+#define VERBOSE_CPUSTOP_ON_DDBBREAK
 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    PCPU_GET(other_cpus));
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* We stop all CPUs except ourselves (obviously) */
-	stop_cpus(PCPU_GET(other_cpus));
+	attempt_to_stop_cpus(PCPU_GET(other_cpus));

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" stopped.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

_at__at_ -204,22 +267,37 _at__at_

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    stopped_cpus);
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* Restart all the CPUs we previously stopped */
 	if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
 		db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n",
 			  PCPU_GET(other_cpus), stopped_cpus);
+		atomic_store_rel_int(&output_lock, 0);
+#if 0
 		panic("stop_cpus() failed");
+#endif
 	}
 	restart_cpus(stopped_cpus);

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" restarted.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 #endif /* CPUSTOP_ON_DDBBREAK */
 #endif /* SMP */
+
+#ifdef SMP
+	atomic_store_rel_int(&kdb_trap_lock, NOCPU);
+#endif

 	write_eflags(ef);
%%%

This is supposed to stop the other CPUs either in kdb_trap() or normally.
The timeouts are hopefully long enough for all the CPUs to stop in 1
of these ways.  But it doesn't always work.  1 possible problem is
that stop and start IPIs may be delivered out of order, so CPUs stopped
in kdb_trap() may end up stopped (since we don't wait for them to see
the stop IPI).

Bruce