Re: Panic with mca trap

From: John Baldwin <jhb@freebsd.org>
Date: Thu, 3 Feb 2011 10:09:52 -0500
On Thursday, February 03, 2011 8:05:31 am John Baldwin wrote:
> On Tuesday, February 01, 2011 11:58:12 am mdf@freebsd.org wrote:
> > While running basic build tests on a piece of hardware, we got an
> > MCA exception that then panicked the kernel due to a WITNESS/INVARIANTS
> > interaction.
> > 
> > panic @ time 1296563157.510, thread 0xffffff0005540000: blockable
> > sleep lock (sleep mutex) 128 @ /build/mnt/src/sys/vm/uma_core.c:1872
> > 
> > Stack: --------------------------------------------------
> > kernel:witness_checkorder+0x7a2
> > kernel:_mtx_lock_flags+0x81
> > kernel:uma_zalloc_arg+0x256
> > kernel:malloc+0xc5
> > kernel:mca_record_entry+0x30
> > kernel:mca_scan+0xc9
> > kernel:mca_intr+0x79
> > kernel:trap+0x30b
> > kernel:witness_checkorder+0x66
> > kernel:_mtx_lock_spin_flags+0xa4
> > kernel:witness_checkorder+0x2a8
> > kernel:_mtx_lock_spin_flags+0xa4
> > kernel:tdq_idled+0xe8
> > kernel:sched_idletd+0x5b
> > kernel:fork_exit+0x9b
> > 
> > That's this bit of code in uma_zalloc_arg():
> > 
> > #ifdef INVARIANTS
> >                         ZONE_LOCK(zone);
> >                         uma_dbg_alloc(zone, NULL, item);
> >                         ZONE_UNLOCK(zone);
> > #endif
> > 
> > 
> > I don't know uma(9) well enough to know the best workaround.  Clearly
> > there are times we can be in uma_zalloc_arg() where taking a regular
> > mutex is not acceptable.  But I don't know what to do to keep the
> > uma_dbg_free() call happy, nor whether to guard taking the ZONE lock
> > on M_NOWAIT, on td_critnest > 0, or on both.
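> > 
> > Something along these lines is what I have in mind, purely as a
> > sketch (I'm not sure whether checking the flags, the critical section
> > depth, or both is the right condition):
> > 
> > #ifdef INVARIANTS
> >                         /*
> >                          * Hypothetical guard: skip the debug checks
> >                          * when the caller cannot take a sleep mutex,
> >                          * i.e. for an M_NOWAIT allocation or while
> >                          * inside a critical section.
> >                          */
> >                         if ((flags & M_NOWAIT) == 0 &&
> >                             curthread->td_critnest == 0) {
> >                                 ZONE_LOCK(zone);
> >                                 uma_dbg_alloc(zone, NULL, item);
> >                                 ZONE_UNLOCK(zone);
> >                         }
> > #endif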
> > 
> > I don't expect we'll see this panic again any time soon, but it would
> > be nice to pin down, for WITNESS, exactly when an M_NOWAIT allocation
> > can be done.
> 
> Actually, this is more my fault.  The machine check happened while the 
> interrupted thread was already in a critical section (hence the WITNESS 
> complaint).  However, it really isn't correct to be calling malloc() from an 
> arbitrary exception handler, especially one like MC# which can fire pretty 
> much anywhere.  I think instead that we should use malloc() when polling the 
> machine check banks, but keep a pre-allocated pool of structures for use with 
> MC# exceptions and CMC interrupts, and replenish the pool via an asynchronous 
> task.
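> 
> As a rough sketch of the task-driven refill (the names here are just
> placeholders to illustrate the idea, not part of the patch below):
> 
> static struct taskqueue *mca_tq;
> static struct task mca_refill_task;
> 
> static void
> mca_refill(void *context, int pending)
> {
> 
> 	/* Runs from a taskqueue thread, so M_WAITOK is fine here. */
> 	mca_fill_freelist();
> }
> 
> 	/*
> 	 * Setup, e.g. from mca_setup().  A "fast" taskqueue is backed by
> 	 * a spin mutex, so enqueueing from MC# / CMCI context should be
> 	 * safe.
> 	 */
> 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
> 	    taskqueue_thread_enqueue, &mca_tq);
> 	taskqueue_start_threads(&mca_tq, 1, PWAIT, "mca taskq");
> 	TASK_INIT(&mca_refill_task, 0, mca_refill, NULL);
> 
> 	/*
> 	 * In the MC# / CMCI path, after pulling a record off the
> 	 * freelist, schedule a refill instead of calling malloc().
> 	 */
> 	taskqueue_enqueue_fast(mca_tq, &mca_refill_task);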

This is what I'm thinking:

Index: mca.c
===================================================================
--- mca.c	(revision 218221)
+++ mca.c	(working copy)
@@ -85,6 +85,7 @@
 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
 
 static int mca_count;		/* Number of records stored. */
+static int mca_banks;		/* Number of per-CPU register banks. */
 
 SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");
 
@@ -102,6 +103,8 @@
 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
 
+static STAILQ_HEAD(, mca_internal) mca_freelist;
+static int mca_freecount;
 static STAILQ_HEAD(, mca_internal) mca_records;
 static struct callout mca_timer;
 static int mca_ticks = 3600;	/* Check hourly by default. */
@@ -111,7 +114,6 @@
 
 #ifdef DEV_APIC
 static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
-static int cmc_banks;
 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
 #endif
 
@@ -415,21 +417,51 @@
 	return (1);
 }
 
-static void __nonnull(1)
-mca_record_entry(const struct mca_record *record)
+static void
+mca_fill_freelist(void)
 {
 	struct mca_internal *rec;
+	int desired;
 
-	rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
-	if (rec == NULL) {
-		printf("MCA: Unable to allocate space for an event.\n");
-		mca_log(record);
-		return;
+	/*
+	 * Ensure we have at least one record for each bank and one
+	 * record per CPU.
+	 */
+	desired = imax(mp_ncpus, mca_banks);
+	mtx_lock_spin(&mca_lock);
+	while (mca_freecount < desired) {
+		mtx_unlock_spin(&mca_lock);
+		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
+		mtx_lock_spin(&mca_lock);
+		STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
+		mca_freecount++;
 	}
+	mtx_unlock_spin(&mca_lock);
+}
 
+static void __nonnull(2)
+mca_record_entry(enum scan_mode mode, const struct mca_record *record)
+{
+	struct mca_internal *rec;
+
+	if (mode == POLLED) {
+		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
+		mtx_lock_spin(&mca_lock);
+	} else {
+		mtx_lock_spin(&mca_lock);
+		rec = STAILQ_FIRST(&mca_freelist);
+		if (rec == NULL) {
+			mtx_unlock_spin(&mca_lock);
+			printf("MCA: Unable to allocate space for an event.\n");
+			mca_log(record);
+			return;
+		}
+		STAILQ_REMOVE_HEAD(&mca_freelist, link);
+		mca_freecount--;
+	}
+
 	rec->rec = *record;
 	rec->logged = 0;
-	mtx_lock_spin(&mca_lock);
 	STAILQ_INSERT_TAIL(&mca_records, rec, link);
 	mca_count++;
 	mtx_unlock_spin(&mca_lock);
@@ -551,7 +583,7 @@
 				recoverable = 0;
 				mca_log(&rec);
 			}
-			mca_record_entry(&rec);
+			mca_record_entry(mode, &rec);
 		}
 	
 #ifdef DEV_APIC
@@ -563,6 +595,8 @@
 			cmci_update(mode, i, valid, &rec);
 #endif
 	}
+	if (mode == POLLED)
+		mca_fill_freelist();
 	return (mode == MCE ? recoverable : count);
 }
 
@@ -642,15 +676,14 @@
 
 #ifdef DEV_APIC
 static void
-cmci_setup(uint64_t mcg_cap)
+cmci_setup(void)
 {
 	int i;
 
 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
 	    M_MCA, M_WAITOK);
-	cmc_banks = mcg_cap & MCG_CAP_COUNT;
 	for (i = 0; i <= mp_maxid; i++)
-		cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
 		    M_MCA, M_WAITOK | M_ZERO);
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -672,10 +705,13 @@
 	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
 		workaround_erratum383 = 1;
 
+	mca_banks = mcg_cap & MCG_CAP_COUNT;
 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
 	STAILQ_INIT(&mca_records);
 	TASK_INIT(&mca_task, 0, mca_scan_cpus, NULL);
 	callout_init(&mca_timer, CALLOUT_MPSAFE);
+	STAILQ_INIT(&mca_freelist);
+	mca_fill_freelist();
 	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
 	    "count", CTLFLAG_RD, &mca_count, 0, "Record count");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
@@ -689,7 +725,7 @@
 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
 #ifdef DEV_APIC
 	if (mcg_cap & MCG_CAP_CMCI_P)
-		cmci_setup(mcg_cap);
+		cmci_setup();
 #endif
 }
 
@@ -707,7 +743,7 @@
 	struct cmc_state *cc;
 	uint64_t ctl;
 
-	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 
 	ctl = rdmsr(MSR_MC_CTL2(i));
 	if (ctl & MC_CTL2_CMCI_EN)
@@ -751,7 +787,7 @@
 	struct cmc_state *cc;
 	uint64_t ctl;
 
-	KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
 
 	/* Ignore banks not monitored by this CPU. */
 	if (!(PCPU_GET(cmci_mask) & 1 << i))

-- 
John Baldwin