On Tue, 30 Sep 2003, Sam Leffler wrote: > It reliably locks up for me when you break into a running system; set a > breakpoint; and then continue. Machine is UP+HTT. Haven't tried other > machines. This seems to be because rev.1.75 of db_interface.c disturbed some much larger bugs related to the ones that it fixed. It takes miracles for entering ddb to even sort of work in the SMP case. If multiple CPUs call kdb_trap() concurrently, e.g., by all hitting the same breakpoint, then after 1.75 they first race to stop each other. Before 1.75, they raced to clobber each other's registers before this. The race to stop each other cannot be won since all the CPUs have interrupts disabled so they cannot respond to IPIs. It doesn't help that stop_cpus is silent about this. It spins silently forever if a CPU can't be stopped, unless DIAGNOSTIC is configured in which case it gives the plain broken behaviour of warning and returning after not waiting for long enough. But things somehow worked better before 1.75. I don't know exactly why. 1.75 only changes the timing a little, and I would have thought that it reduced the races by giving the other CPUs less time to enter ddb. My tests mainly used a breakpoint at ithread_schedule which is sure to be hit by multiple CPUs quite often, but there wasn't enough interrupt activity for concurrent entry to be the usual case. Debugging printfs affected the races a lot -- turning on VERBOSE_CPUSTOP_ON_DDBBREAK mostly avoided the problem, but with a syscons console it sometimes caused fatal traps in bcopy(). If one of multiple CPUs in kdb_trap() somehow stops the others, then the others face different problems when they restart. They can't just return because debugger traps are not restartable (by just returning). They can't just proceed because the first CPU may have changed the state in such a way as to make proceeding in the normal way not work (e.g., it may have deleted a breakpoint). 
These problems are not correctly or completely fixed in: %%% Index: db_interface.c =================================================================== RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v retrieving revision 1.75 diff -u -2 -r1.75 db_interface.c --- db_interface.c 7 Sep 2003 13:43:01 -0000 1.75 +++ db_interface.c 7 Oct 2003 14:11:35 -0000 @@ -35,4 +35,5 @@ #include <sys/reboot.h> #include <sys/cons.h> +#include <sys/ktr.h> #include <sys/pcpu.h> #include <sys/proc.h> @@ -41,4 +42,5 @@ #include <machine/cpu.h> #ifdef SMP +#include <machine/smp.h> #include <machine/smptests.h> /** CPUSTOP_ON_DDBBREAK */ #endif @@ -73,4 +75,31 @@ } +/* XXX this is cloned from stop_cpus() since that function can hang. */ +static int +attempt_to_stop_cpus(u_int map) +{ + int i; + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map); + + /* send the stop IPI to all CPUs in map */ + ipi_selected(map, IPI_STOP); + + i = 0; + while ((atomic_load_acq_int(&stopped_cpus) & map) != map) { + /* spin */ + i++; + if (i == 100000000) { + printf("timeout stopping cpus\n"); + break; + } + } + + return 1; +} + /* * kdb_trap - field a TRACE or BPT trap @@ -81,4 +110,6 @@ u_int ef; volatile int ddb_mode = !(boothowto & RB_GDB); + static u_int kdb_trap_lock = NOCPU; + static u_int output_lock; /* @@ -103,16 +134,48 @@ #ifdef SMP + if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 && + kdb_trap_lock != PCPU_GET(cpuid)) { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; + db_printf( + "concurrent ddb entry: type %d trap, code=%x cpu=%d\n", + type, code, PCPU_GET(cpuid)); + atomic_store_rel_int(&output_lock, 0); + if (type == T_BPTFLT) + regs->tf_eip--; + else { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; + db_printf( +"concurrent ddb entry on non-breakpoint: too hard to handle properly\n"); + atomic_store_rel_int(&output_lock, 0); + } + while 
(atomic_load_acq_int(&kdb_trap_lock) != NOCPU) + ; + write_eflags(ef); + return (1); + } +#endif + +#ifdef SMP #ifdef CPUSTOP_ON_DDBBREAK +#define VERBOSE_CPUSTOP_ON_DDBBREAK #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid), PCPU_GET(other_cpus)); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ /* We stop all CPUs except ourselves (obviously) */ - stop_cpus(PCPU_GET(other_cpus)); + attempt_to_stop_cpus(PCPU_GET(other_cpus)); #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf(" stopped.\n"); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ @@ -204,22 +267,37 @@ #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid), stopped_cpus); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ /* Restart all the CPUs we previously stopped */ if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) { + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n", PCPU_GET(other_cpus), stopped_cpus); + atomic_store_rel_int(&output_lock, 0); +#if 0 panic("stop_cpus() failed"); +#endif } restart_cpus(stopped_cpus); #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK) + while (atomic_cmpset_int(&output_lock, 0, 1) == 0) + ; db_printf(" restarted.\n"); + atomic_store_rel_int(&output_lock, 0); #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */ #endif /* CPUSTOP_ON_DDBBREAK */ #endif /* SMP */ + +#ifdef SMP + atomic_store_rel_int(&kdb_trap_lock, NOCPU); +#endif write_eflags(ef); %%% This is supposed to stop the other CPUs either in kdb_trap() or normally. The timeouts are hopefully long enough for all the CPUs to stop in 1 of these ways. But it doesn't always work. 
1 possible problem is that stop and start IPIs may be delivered out of order, so CPUs stopped in kdb_trap() may end up stopped permanently (since we don't wait for them to see the stop IPI). Bruce

Received on Tue Oct 07 2003 - 07:10:31 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:24 UTC