On Thu, Feb 3, 2011 at 7:09 AM, John Baldwin <jhb@freebsd.org> wrote:
> On Thursday, February 03, 2011 8:05:31 am John Baldwin wrote:
>> On Tuesday, February 01, 2011 11:58:12 am mdf@freebsd.org wrote:
>> > On a piece of hardware being used to verify basic build tests, we got an
>> > MCA exception that then panicked the kernel due to a WITNESS/INVARIANTS
>> > interaction.
>> >
>> > panic @ time 1296563157.510, thread 0xffffff0005540000: blockable
>> > sleep lock (sleep mutex) 128 @ /build/mnt/src/sys/vm/uma_core.c:1872
>> >
>> > Stack: --------------------------------------------------
>> > kernel:witness_checkorder+0x7a2
>> > kernel:_mtx_lock_flags+0x81
>> > kernel:uma_zalloc_arg+0x256
>> > kernel:malloc+0xc5
>> > kernel:mca_record_entry+0x30
>> > kernel:mca_scan+0xc9
>> > kernel:mca_intr+0x79
>> > kernel:trap+0x30b
>> > kernel:witness_checkorder+0x66
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:witness_checkorder+0x2a8
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:tdq_idled+0xe8
>> > kernel:sched_idletd+0x5b
>> > kernel:fork_exit+0x9b
>> >
>> > That's this bit of code in uma_zalloc_arg():
>> >
>> > #ifdef INVARIANTS
>> >         ZONE_LOCK(zone);
>> >         uma_dbg_alloc(zone, NULL, item);
>> >         ZONE_UNLOCK(zone);
>> > #endif
>> >
>> > I don't know uma(9) well enough to know the best workaround.  Clearly
>> > there are times we can be in uma_zalloc_arg() and taking a regular
>> > mutex is not acceptable.  But what to do for the uma_dbg_free() call
>> > so it's happy, and whether to guard taking the ZONE lock with M_NOWAIT
>> > or td_critnest > 0 or both is outside my current knowledge.
>> >
>> > I don't expect we'll see this panic again any time soon, but it would
>> > be nice to fix the story for WITNESS of when an M_NOWAIT allocation
>> > can be done.
>>
>> Actually, this is more my fault.  The machine check happened while the
>> interrupted thread was already in a critical section (hence the WITNESS
>> complaint).  However, it really isn't correct to be calling malloc() from
>> an arbitrary exception handler, especially one like MC#, which can fire
>> pretty much anywhere.  I think instead that we should use malloc() when
>> polling the machine check banks, but keep a pre-allocated pool of
>> structures for use with MC# exceptions and CMC interrupts and replenish
>> the pool via an asynchronous task.
>
> This is what I'm thinking:
>
> Index: mca.c
> ===================================================================
> --- mca.c	(revision 218221)
> +++ mca.c	(working copy)
> @@ -85,6 +85,7 @@
>  static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
>
>  static int mca_count;		/* Number of records stored. */
> +static int mca_banks;		/* Number of per-CPU register banks. */
>
>  SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");
>
> @@ -102,6 +103,8 @@
>  SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
>      "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
>
> +static STAILQ_HEAD(, mca_internal) mca_freelist;
> +static int mca_freecount;
>  static STAILQ_HEAD(, mca_internal) mca_records;
>  static struct callout mca_timer;
>  static int mca_ticks = 3600;	/* Check hourly by default. */
> @@ -111,7 +114,6 @@
>
>  #ifdef DEV_APIC
>  static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
> -static int cmc_banks;
>  static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
>  #endif
>
> @@ -415,21 +417,51 @@
>  	return (1);
>  }
>
> -static void __nonnull(1)
> -mca_record_entry(const struct mca_record *record)
> +static void
> +mca_fill_freelist(void)
>  {
>  	struct mca_internal *rec;
> +	int desired;
>
> -	rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
> -	if (rec == NULL) {
> -		printf("MCA: Unable to allocate space for an event.\n");
> -		mca_log(record);
> -		return;
> +	/*
> +	 * Ensure we have at least one record for each bank and one
> +	 * record per CPU.
> +	 */
> +	desired = imax(mp_ncpus, mca_banks);
> +	mtx_lock_spin(&mca_lock);
> +	while (mca_freecount < desired) {
> +		mtx_unlock_spin(&mca_lock);
> +		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
> +		mtx_lock_spin(&mca_lock);
> +		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
> +		mca_freecount++;
>  	}
> +	mtx_unlock_spin(&mca_lock);
> +}
>
> +static void __nonnull(2)
> +mca_record_entry(enum scan_mode mode, const struct mca_record *record)
> +{
> +	struct mca_internal *rec;
> +
> +	if (mode == POLLED) {
> +		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
> +		mtx_lock_spin(&mca_lock);
> +	} else {
> +		mtx_lock_spin(&mca_lock);
> +		rec = STAILQ_FIRST(&mca_freelist);
> +		if (rec == NULL) {
> +			mtx_unlock_spin(&mca_lock);
> +			printf("MCA: Unable to allocate space for an event.\n");
> +			mca_log(record);
> +			return;
> +		}
> +		STAILQ_REMOVE_HEAD(&mca_freelist, link);
> +		mca_freecount--;
> +	}
> +
>  	rec->rec = *record;
>  	rec->logged = 0;
> -	mtx_lock_spin(&mca_lock);
>  	STAILQ_INSERT_TAIL(&mca_records, rec, link);
>  	mca_count++;
>  	mtx_unlock_spin(&mca_lock);
> @@ -551,7 +583,7 @@
>  		recoverable = 0;
>  		mca_log(&rec);
>  	}
> -	mca_record_entry(&rec);
> +	mca_record_entry(mode, &rec);
>  }
>
>  #ifdef DEV_APIC
> @@ -563,6 +595,8 @@
>  			cmci_update(mode, i, valid, &rec);
>  #endif
>  	}
> +	if (mode == POLLED)
> +		mca_fill_freelist();
>  	return (mode == MCE ? recoverable : count);
>  }
>
> @@ -642,15 +676,14 @@
>
>  #ifdef DEV_APIC
>  static void
> -cmci_setup(uint64_t mcg_cap)
> +cmci_setup(void)
>  {
>  	int i;
>
>  	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
>  	    M_MCA, M_WAITOK);
> -	cmc_banks = mcg_cap & MCG_CAP_COUNT;
>  	for (i = 0; i <= mp_maxid; i++)
> -		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
> +		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
>  		    M_MCA, M_WAITOK | M_ZERO);
>  	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
>  	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> @@ -672,10 +705,13 @@
>  	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
>  		workaround_erratum383 = 1;
>
> +	mca_banks = mcg_cap & MCG_CAP_COUNT;
>  	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
>  	STAILQ_INIT(&mca_records);
>  	TASK_INIT(&mca_task, 0, mca_scan_cpus, NULL);
>  	callout_init(&mca_timer, CALLOUT_MPSAFE);
> +	STAILQ_INIT(&mca_freelist);
> +	mca_fill_freelist();
>  	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
>  	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
>  	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
> @@ -689,7 +725,7 @@
>  	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
>  #ifdef DEV_APIC
>  	if (mcg_cap & MCG_CAP_CMCI_P)
> -		cmci_setup(mcg_cap);
> +		cmci_setup();
>  #endif
>  }
>
> @@ -707,7 +743,7 @@
>  	struct cmc_state *cc;
>  	uint64_t ctl;
>
> -	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
> +	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
>
>  	ctl = rdmsr(MSR_MC_CTL2(i));
>  	if (ctl & MC_CTL2_CMCI_EN)
> @@ -751,7 +787,7 @@
>  	struct cmc_state *cc;
>  	uint64_t ctl;
>
> -	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
> +	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
>
>  	/* Ignore banks not monitored by this CPU. */
>  	if (!(PCPU_GET(cmci_mask) & 1 << i))

This generally looks right to me.  I can't test it locally since we're
trying to get a release out the door, and I think we've already replaced
the bad CPU on the machine that failed.

Thanks,
matthew
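The pattern jhb's patch implements is worth distilling, since it comes up
whenever an exception or interrupt handler needs memory it cannot safely
allocate: contexts that must not sleep (the MC# handler, CMC interrupts)
only pop pre-allocated records off a free list under a spin lock, while
contexts that may sleep (the periodic poller, an asynchronous task) refill
the list with blocking allocations made outside the lock.  Below is a
minimal userland sketch of the same idea, with POSIX spinlocks standing in
for the kernel's spin mutexes; the names (struct record, fill_freelist,
grab_record) are invented for illustration and are not from the FreeBSD
tree.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct record {
            int data;
            STAILQ_ENTRY(record) link;
    };

    static STAILQ_HEAD(, record) freelist = STAILQ_HEAD_INITIALIZER(freelist);
    static int freecount;
    static pthread_spinlock_t freelock;

    /*
     * May sleep: allocate outside the spin lock, insert under it.  This
     * is the userland analogue of mca_fill_freelist() in the patch.
     */
    static void
    fill_freelist(int desired)
    {
            struct record *rec;

            pthread_spin_lock(&freelock);
            while (freecount < desired) {
                    pthread_spin_unlock(&freelock);
                    rec = malloc(sizeof(*rec));
                    if (rec == NULL)
                            abort();    /* M_WAITOK never fails; emulate that */
                    pthread_spin_lock(&freelock);
                    STAILQ_INSERT_TAIL(&freelist, rec, link);
                    freecount++;
            }
            pthread_spin_unlock(&freelock);
    }

    /*
     * Must not sleep: only touch the pre-filled list, never the
     * allocator.  A NULL return means the pool was exhausted, which the
     * kernel patch handles by logging the record immediately instead of
     * queueing it.
     */
    static struct record *
    grab_record(void)
    {
            struct record *rec;

            pthread_spin_lock(&freelock);
            rec = STAILQ_FIRST(&freelist);
            if (rec != NULL) {
                    STAILQ_REMOVE_HEAD(&freelist, link);
                    freecount--;
            }
            pthread_spin_unlock(&freelock);
            return (rec);
    }

    int
    main(void)
    {
            struct record *rec;

            pthread_spin_init(&freelock, PTHREAD_PROCESS_PRIVATE);
            fill_freelist(4);
            rec = grab_record();    /* safe even in a no-sleep context */
            printf("got %p, %d records left in the pool\n",
                (void *)rec, freecount);
            free(rec);
            return (0);
    }

In the kernel version the refill runs after every polled scan, so an MC#
arriving at any moment finds at least imax(mp_ncpus, mca_banks) records
waiting: one per CPU or one per bank, whichever is larger.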