Re: calcru: negative time ... followed by freeze

From: Bruce Evans <bde_at_zeta.org.au> Date: Mon, 21 Jun 2004 23:38:06 +1000 (EST) · This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:58 UTC

On Mon, 21 Jun 2004, Bruce Evans wrote:

> On Mon, 21 Jun 2004, Don Lewis wrote:
> > It looks like the bug is in the exit code.  I tweaked the printf() in
> > calcru() to print out p_state, p_flag, and p_sflag in addition to the
> > other info.  In all cases, the processes that trigger the printf were
> > zombies, and show up as [running] in ttyinfo() on a uniprocessor box.
>
> There is a PR about this (#52490).  The oops in my test fixes it for bogus
> reasons.  I'm currently adding similar printfs to help figure out what is
> going wrong.
>
> > Jun 21 03:17:03 hairball kernel: calcru: negative time of 179 usec for
> > pid 4543 (sh) p_state=0x2 p_flag=0x2002 p_sflag=0x1
> >
> > load: 0.71  cmd: sh 4543 [running] 0.00u 0.00s 3% 0k
> >
> > p_runtime only gets updated in mi_switch(), and it appears that it never
> > gets updated after the calcru() call in exit1().
>
> That explains the problem.  The calcru() values sets up to date (final)
> values for the components of ptu.  When we look at the process after it
> has become a zombie, we use only p_runtime since the process is not
> running, but p_runtime is stale.

I happened to have fixed this already without really noticing.  (My kernel
doesn't call calcru() in exit1() or in wait1(); it accumulates p_runtime
and tick counts (instead of those values converted to timevals by calcru())
in the child stats, so it has to get the final p_runtime right.)

%%%
Index: kern_exit.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.236
diff -u -2 -r1.236 kern_exit.c
--- kern_exit.c	18 Jun 2004 11:13:49 -0000	1.236
+++ kern_exit.c	21 Jun 2004 13:04:31 -0000
_at__at_ -104,4 +104,5 _at__at_
 exit1(struct thread *td, int rv)
 {
+ 	struct bintime new_switchtime;
 	struct proc *p, *nq, *q;
 	struct tty *tp;
_at__at_ -518,8 +519,14 _at__at_
 	mtx_lock_spin(&sched_lock);
 	critical_exit();
-	cnt.v_swtch++;
-	binuptime(PCPU_PTR(switchtime));
+
+	/* Do the same timestamp bookkeeping that mi_switch() would do. */
+	binuptime(&new_switchtime);
+	bintime_add(&p->p_runtime, &new_switchtime);
+	bintime_sub(&p->p_runtime, PCPU_PTR(switchtime));
+	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);

+	cnt.v_swtch++;
+
 	/*
 	 * Allow the scheduler to adjust the priority of the
%%%

I will commit this soon.

Workaround for the main problem:

%%%
Index: kern_resource.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.139
diff -u -2 -r1.139 kern_resource.c
--- kern_resource.c	16 Jun 2004 00:26:29 -0000	1.139
+++ kern_resource.c	21 Jun 2004 12:55:24 -0000
_at__at_ -702,8 +702,10 _at__at_
 	struct timeval *ip;
 {
-	struct bintime bt;
+	struct bintime bt, rt;
 	struct timeval tv;
+	struct thread *td;
 	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
 	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
+	int problemcase;

 	mtx_assert(&sched_lock, MA_OWNED);
_at__at_ -719,22 +721,40 _at__at_
 		tt = 1;
 	}
-	if (p == curthread->td_proc) {
+	rt = p->p_runtime;
+	problemcase = 0;
+	FOREACH_THREAD_IN_PROC(p, td) {
 		/*
 		 * Adjust for the current time slice.  This is actually fairly
 		 * important since the error here is on the order of a time
 		 * quantum, which is much greater than the sampling error.
-		 * XXXKSE use a different test due to threads on other
-		 * processors also being 'current'.
 		 */
-		binuptime(&bt);
-		bintime_sub(&bt, PCPU_PTR(switchtime));
-		bintime_add(&bt, &p->p_runtime);
-	} else
-		bt = p->p_runtime;
-	bintime2timeval(&bt, &tv);
+		if (td == curthread) {
+			binuptime(&bt);
+			bintime_sub(&bt, PCPU_PTR(switchtime));
+			bintime_add(&rt, &bt);
+		} else if (TD_IS_RUNNING(td)) {
+			/*
+			 * This case should add the current time less the
+			 * switch time as above, but the switch time is
+			 * inaccessible.  So we might end up with rt too
+			 * small and then the monotonicity check might detect
+			 * the problem.  Just set a flag to avoid warning
+			 * about this known problem.
+			 */
+			problemcase = 1;
+		}
+	}
+	bintime2timeval(&rt, &tv);
 	tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 	ptu = p->p_uu + p->p_su + p->p_iu;
-	if (tu < ptu || (int64_t)tu < 0) {
-		printf("calcru: negative time of %jd usec for pid %d (%s)\n",
+	if (tu < ptu) {
+		if (!problemcase)
+			printf(
+"calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
+			    (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
+		tu = ptu;
+	}
+	if ((int64_t)tu < 0) {
+		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 		    (intmax_t)tu, p->p_pid, p->p_comm);
 		tu = ptu;
%%%

Bruce