Re: [RFC/RFT] calloutng

From: Luigi Rizzo <rizzo_at_iet.unipi.it>
Date: Sun, 17 Feb 2013 03:42:20 +0100
On Thu, Feb 14, 2013 at 11:56:54PM -0800, Alfred Perlstein wrote:
> [ added Luigi Rizzo to thread ]
> 
> 
> On 2/11/13 12:26 PM, Davide Italiano wrote:
> > [trimmed old mails]
> >
> > Here's a new version of the patch:
> > http://people.freebsd.org/~davide/patches/calloutng-11022012.diff
> >
> > Significant bits changed (after wider discussion and suggestion by phk_at_):
> > - Introduction of the new sbintime_t type (32.32 fixed point) with the
> > respective conversion (sbintime2bintime, sbintime2timeval etc...)
> > - switch from 64.64 (struct bintime) format to measure time to sbintime_t
> > - Use sbintime_t to represent expected sleep time instead of measuring
> > it in microseconds (cpu_idle_acpi(), cpu_idle_spin() ...).
> 
> 
> Luigi and I discussed this at BAFUG tonight and he expressed an interest 
> in shepherding the code in at this point.
> 
> Luigi, can you reiterate any points that still remain before we 
> integrate this code?

I am mostly ok with the patch in the current state.
I have few comments/suggestions below but they are mostly
on documenting/style/trivial bugfixes.

So now I would really like to see this code go in HEAD,
to give people a chance to see how it works and possibly
figure out whether the new API needs modifications.

To recall, my major concerns were the following:

- API explosion, with multiple versions of the callout routines.

  This seems to be solved now, with only one version (the *_sbt()
  functions) and macros remapping the old functions to the new ones.

- the use of struct bintime for timeouts and precision.

  This is also solved thanks to the introduction of sbintime_t values
  (fixed-point 32.32 times)

- Some measurements also indicated that the default "precision"
  could give large (absolute) errors on the timeouts, especially
  with large timeouts.

  I am not sure what is the situation with this new version, but i believe
  this a relatively minor issue because it surely simple to customize, e.g.
  having a couple of sysctl setting the default precision (percentage)
  and maximum error (absolute time). So users could e.g. set
  a 5% precision and a maximum error of 100us on a desktop,
  and 10% and 10ms on a laptop.

- Another thing that we should do (but after the patch is in) is to
  see if any existing code is converting times to ticks just to
  call the timeout routines -- right now the macros convert back
  to times, clearly it would be best to avoid the double conversion.

comments inline below, search for XXX-lr

thanks again for your work on this, and for following up with changes
after the previous discussion

cheers
luigi
  
Index: sys/dev/acpica/acpi_cpu.c
===================================================================
--- sys/dev/acpica/acpi_cpu.c	(.../head)	(revision 246685)
+++ sys/dev/acpica/acpi_cpu.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -980,13 +980,16 _at__at_
     }
 
     /* Find the lowest state that has small enough latency. */
+    us = sc->cpu_prev_sleep;
+    if (sbt >= 0 && us > sbt / SBT_1US)	XXX-lr can we have us2sbt() , ms2sbt() and the like ?
+	us = sbt / SBT_1US;
     cx_next_idx = 0;
     if (cpu_disable_deep_sleep)
 	i = min(sc->cpu_cx_lowest, sc->cpu_non_c3);
     else
 	i = sc->cpu_cx_lowest;
     for (; i >= 0; i--) {
-	if (sc->cpu_cx_states[i].trans_lat * 3 <= sc->cpu_prev_sleep) {
+	if (sc->cpu_cx_states[i].trans_lat * 3 <= us) {
 	    cx_next_idx = i;
 	    break;
 	}
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c	(.../head)	(revision 246685)
+++ sys/kern/kern_synch.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -146,12 +146,12 _at__at_
  */
 int
 _sleep(void *ident, struct lock_object *lock, int priority,
-    const char *wmesg, int timo)
+    const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 {
 	struct thread *td;
 	struct proc *p;
 	struct lock_class *class;
-	int catch, flags, lock_state, pri, rval;
+	int catch, sleepq_flags, lock_state, pri, rval;		XXX-lr keep flags, use _sleep_flags as function argument ?
 	WITNESS_SAVE_DECL(lock_witness);
 
 	td = curthread;
_at__at_ -348,28 +349,30 _at__at_
  * to a "timo" value of one.
  */
 int
-pause(const char *wmesg, int timo)
+pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 {
-	KASSERT(timo >= 0, ("pause: timo must be >= 0"));
+	int sbt_sec;						XXX-lr localize to the cold block
 
+	sbt_sec = sbintime_getsec(sbt);	
+	KASSERT(sbt_sec >= 0, ("pause: timo must be >= 0"));
+
 	/* silently convert invalid timeouts */
-	if (timo < 1)
-		timo = 1;
+	if (sbt == 0)
+		sbt = tick_sbt;
 
 	if (cold) {
 		/*
-		 * We delay one HZ at a time to avoid overflowing the
+		 * We delay one second at a time to avoid overflowing the
 		 * system specific DELAY() function(s):
 		 */
-		while (timo >= hz) {
+		while (sbt_sec > 0) {
 			DELAY(1000000);
-			timo -= hz;
+			sbt_sec--;
 		}
-		if (timo > 0)
-			DELAY(timo * tick);
+		DELAY((sbt & 0xffffffff) / SBT_1US);	XXX-lr sbt2us() ?
 		return (0);
 	}
-	return (tsleep(&pause_wchan, 0, wmesg, timo));
+	return (_sleep(&pause_wchan, NULL, 0, wmesg, sbt, pr, flags));
 }
 
 /*
_at__at_ -560,8 +563,9 _at__at_
 	 * random variation to avoid synchronisation with processes that
 	 * run at regular intervals.
 	 */
-	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
-	    loadav, NULL);
+	callout_reset_sbt(&loadav_callout,				XXX-lr we have better resolution than HZ here ?
+	    tick_sbt * (hz * 4 + (int)(random() % (hz * 2 + 1))), 0,
+	    loadav, NULL, C_DIRECT_EXEC | C_HARDCLOCK);
 }
 
 /* ARGSUSED */
Index: sys/kern/sys_generic.c
===================================================================
--- sys/kern/sys_generic.c	(.../head)	(revision 246685)
+++ sys/kern/sys_generic.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -995,35 +996,29 _at__at_
 	if (nbufbytes != 0)
 		bzero(selbits, nbufbytes / 2);
 
+	precision = 0;
 	if (tvp != NULL) {
-		atv = *tvp;
-		if (itimerfix(&atv)) {
+		rtv = *tvp;
+		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||	XXX-lr itimerfix or some check routine ? several places
+		    rtv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
-	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
-	}
-	timo = 0;
+		rsbt = timeval2sbintime(rtv);
+		precision = rsbt;
+		precision >>= tc_precexp;
+		if (TIMESEL(&asbt, rsbt))
+			asbt += tc_tick_sbt;
+		asbt += rsbt; 
+	} else
+		asbt = -1;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
-				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
-		}
-		error = seltdwait(td, timo);
+		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
Index: sys/kern/kern_tc.c
===================================================================
--- sys/kern/kern_tc.c	(.../head)	(revision 246685)
+++ sys/kern/kern_tc.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -119,6 +120,21 _at__at_
 SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
     &timestepwarnings, 0, "Log time steps");
 
+struct bintime bt_timethreshold;		XXX-lr document these variables
+struct bintime bt_tickthreshold;
+sbintime_t sbt_timethreshold;
+sbintime_t sbt_tickthreshold;
+struct bintime tc_tick_bt;
+sbintime_t tc_tick_sbt;
+int tc_precexp;
+int tc_timepercentage = TC_DEFAULTPERC;
+TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage);
+static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+    sysctl_kern_timecounter_adjprecision, "I",
+    "Allowed time interval deviation in percents");
+
 static void tc_windup(void);
 static void cpu_tick_calibrate(int);
 
_at__at_ -883,6 +919,16 _at__at_
 }
 
 void
+sbinuptime(sbintime_t sbt)	XXX-lr wrong argument ? not compile-tested ?
Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c	(.../head)	(revision 246685)
+++ sys/kern/kern_timeout.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -87,58 +105,62 _at__at_
 int callwheelsize, callwheelmask;
 
 /*
- * The callout cpu migration entity represents informations necessary for
- * describing the migrating callout to the new callout cpu.
+ * The callout cpu exec entities represent informations necessary for
+ * describing the state of callouts currently running on the CPU and the ones
+ * necessary for migrating callouts to the new callout cpu. In particular,
+ * the first entry of the array cc_exec_entity holds informations for callout
+ * running in SWI thread context, while the second one holds informations
+ * for callout running directly from hardware interrupt context.
  * The cached informations are very important for deferring migration when
  * the migrating callout is already running.
  */
-struct cc_mig_ent {
+struct cc_exec {
+	struct callout		*cc_next;
+	struct callout		*cc_curr;
 #ifdef SMP
-	void	(*ce_migration_func)(void *);
-	void	*ce_migration_arg;
-	int	ce_migration_cpu;
-	int	ce_migration_ticks;
+	void			(*ce_migration_func)(void *);
+	void			*ce_migration_arg;
+	int			ce_migration_cpu;
+	sbintime_t		ce_migration_time;
 #endif
+	int			cc_cancel;
+	int			cc_waiting;
 };
-	
+
 /*
- * There is one struct callout_cpu per cpu, holding all relevant
+ * There is one struct callou_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
- * In particular:
- *	cc_ticks is incremented once per tick in callout_cpu().
- *	It tracks the global 'ticks' but in a way that the individual
- *	threads should not worry about races in the order in which
- *	hardclock() and hardclock_cpu() run on the various CPUs.
- *	cc_softclock is advanced in callout_cpu() to point to the
- *	first entry in cc_callwheel that may need handling. In turn,
- *	a softclock() is scheduled so it can serve the various entries i
- *	such that cc_softclock <= i <= cc_ticks .
- *	XXX maybe cc_softclock and cc_ticks should be volatile ?
- *
- *	cc_ticks is also used in callout_reset_cpu() to determine
- *	when the callout should be served.
  */
 struct callout_cpu {
 	struct mtx_padalign	cc_lock;
-	struct cc_mig_ent	cc_migrating_entity;
+	struct cc_exec 		cc_exec_entity[2];	XXX-lr repeat (short) explanation on why 2 entities
 	struct callout		*cc_callout;
 	struct callout_tailq	*cc_callwheel;
+	struct callout_tailq	cc_expireq;
 	struct callout_list	cc_callfree;
-	struct callout		*cc_next;
-	struct callout		*cc_curr;
+	sbintime_t		cc_firstevent;
+	sbintime_t		cc_lastscan;
 	void			*cc_cookie;
-	int 			cc_ticks;
-	int 			cc_softticks;
-	int			cc_cancel;
-	int			cc_waiting;
-	int 			cc_firsttick;
 };
 
+#define	cc_exec_curr		cc_exec_entity[0].cc_curr
+#define	cc_exec_next		cc_exec_entity[0].cc_next
+#define	cc_exec_cancel		cc_exec_entity[0].cc_cancel
+#define	cc_exec_waiting		cc_exec_entity[0].cc_waiting
+#define	cc_exec_curr_dir	cc_exec_entity[1].cc_curr
+#define	cc_exec_next_dir	cc_exec_entity[1].cc_next
+#define	cc_exec_cancel_dir	cc_exec_entity[1].cc_cancel
+#define	cc_exec_waiting_dir	cc_exec_entity[1].cc_waiting
+
 #ifdef SMP
-#define	cc_migration_func	cc_migrating_entity.ce_migration_func
-#define	cc_migration_arg	cc_migrating_entity.ce_migration_arg
-#define	cc_migration_cpu	cc_migrating_entity.ce_migration_cpu
-#define	cc_migration_ticks	cc_migrating_entity.ce_migration_ticks
+#define	cc_migration_func	cc_exec_entity[0].ce_migration_func
+#define	cc_migration_arg	cc_exec_entity[0].ce_migration_arg
+#define	cc_migration_cpu	cc_exec_entity[0].ce_migration_cpu
+#define	cc_migration_time	cc_exec_entity[0].ce_migration_time
+#define	cc_migration_func_dir	cc_exec_entity[1].ce_migration_func
+#define	cc_migration_arg_dir	cc_exec_entity[1].ce_migration_arg
+#define	cc_migration_cpu_dir	cc_exec_entity[1].ce_migration_cpu
+#define	cc_migration_time_dir	cc_exec_entity[1].ce_migration_time
 
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
Index: sys/ia64/ia64/machdep.c
===================================================================
--- sys/ia64/ia64/machdep.c	(.../head)	(revision 246685)
+++ sys/ia64/ia64/machdep.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -404,7 +405,7 _at__at_
 	if (sched_runnable())
 		ia64_enable_intr();
 	else if (cpu_idle_hook != NULL) {
-		(*cpu_idle_hook)();		XXX-lr style ? just use cpu_idle_hook(sbt);
+		(*cpu_idle_hook)(sbt);
 		/* The hook must enable interrupts! */
 	} else {
 		ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
Index: sys/ofed/include/linux/timer.h
===================================================================
--- sys/ofed/include/linux/timer.h	(.../head)	(revision 246685)
+++ sys/ofed/include/linux/timer.h	(.../projects/calloutng)	(revision 246685)
_at__at_ -65,13 +64,16 _at__at_
 	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);		\
 } while (0)
 
-#define	mod_timer(timer, expire)					\	XXX-lr gratuitous rename
-	callout_reset(&(timer)->timer_callout, (expire) - jiffies,	\
-	    _timer_fn, (timer))
+#define	mod_timer(timer, exp)						\
+do {									\
+	(timer)->expires = exp;						\
+	callout_reset(&(timer)->timer_callout, (exp) - jiffies,		\
+	    _timer_fn, (timer));					\
+} while (0)
 
 #define	add_timer(timer)						\
 	callout_reset(&(timer)->timer_callout,				\
-	    (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+	    (timer)->expires - jiffies, _timer_fn, (timer))
 
 #define	del_timer(timer)	callout_stop(&(timer)->timer_callout)
 #define	del_timer_sync(timer)	callout_drain(&(timer)->timer_callout)
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h	(.../head)	(revision 246685)
+++ sys/sys/callout.h	(.../projects/calloutng)	(revision 246685)
_at__at_ -47,7 +47,17 _at__at_
 #define	CALLOUT_RETURNUNLOCKED	0x0010 /* handler returns with mtx unlocked */
 #define	CALLOUT_SHAREDLOCK	0x0020 /* callout lock held in shared mode */
 #define	CALLOUT_DFRMIGRATION	0x0040 /* callout in deferred migration mode */
+#define	CALLOUT_PROCESSED	0x0080 /* callout in wheel or processing list? */
+#define	CALLOUT_DIRECT 		0x0100 /* allow exec from hw int context */
 
+#define	C_DIRECT_EXEC		0x0001 /* direct execution of callout */
+#define	C_PRELBITS		7						XXX-lr document all
+#define	C_PRELRANGE		((1 << C_PRELBITS) - 1)
+#define	C_PREL(x)		(((x) + 1) << 1)
+#define	C_PRELGET(x)		(int)((((x) >> 1) & C_PRELRANGE) - 1)
+#define	C_HARDCLOCK		0x0100 /* align to hardclock() calls */
+#define	C_ABSOLUTE		0x0200 /* event time is absolute. */
+
 struct callout_handle {
 	struct callout *callout;
 };
Index: sys/sys/time.h
===================================================================
--- sys/sys/time.h	(.../head)	(revision 246685)
+++ sys/sys/time.h	(.../projects/calloutng)	(revision 246685)
_at__at_ -109,6 +124,45 _at__at_
 	    ((a)->frac cmp (b)->frac) :					\
 	    ((a)->sec cmp (b)->sec))
 
+typedef int64_t sbintime_t;				XXX-lr add function ns2sbt(), us2sbt(), ms2sbt()
+#define	SBT_1S	((sbintime_t)1 << 32)
+#define	SBT_1M	(SBT_1S * 60)
+#define	SBT_1MS	(SBT_1S / 1000)
+#define	SBT_1US	(SBT_1S / 1000000)
+#define	SBT_1NS	(SBT_1S / 1000000000)
+
+static __inline int
+sbintime_getsec(sbintime_t sbt)
+{
+	
+	return (int)(sbt >> 32);
+}
+
+static __inline sbintime_t
+bintime2sbintime(const struct bintime bt)
+{
+
+	return (((sbintime_t)bt.sec << 32) + (bt.frac >> 32));
+}
+
+static __inline struct bintime 
+sbintime2bintime(sbintime_t sbt)
+{
+	struct bintime bt;
+
+	bt.sec = sbt >> 32;
+	bt.frac = sbt << 32;
+	return (bt);
+
+}
+
+#ifdef _KERNEL
+
+extern struct bintime tick_bt;
+extern sbintime_t tick_sbt;
+
+#endif /* KERNEL */ 
+
 /*-
  * Background information:
  *
_at__at_ -290,7 +381,15 _at__at_
 extern volatile time_t	time_second;
 extern volatile time_t	time_uptime;
 extern struct bintime boottimebin;
+extern struct bintime tc_tick_bt;
+extern sbintime_t tc_tick_sbt;				XXX-lr comment the new vars
 extern struct timeval boottime;
+extern int tc_precexp;					XXX-lr comment
+extern int tc_timepercentage;
+extern struct bintime bt_timethreshold;
+extern struct bintime bt_tickthreshold;
+extern sbintime_t sbt_timethreshold;
+extern sbintime_t sbt_tickthreshold;
 
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
_at__at_ -337,6 +438,23 _at__at_
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
+
+#define	TC_DEFAULTPERC		5	XXX-lr comment
+
+#define	BT2FREQ(bt)                                                     \
+	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /           \
+	    ((bt)->frac >> 1))
+
+#define	FREQ2BT(freq, bt)                                               \
+{									\
+	(bt)->sec = 0;                                                  \
+	(bt)->frac = ((uint64_t)0x8000000000000000  / (freq)) << 1;     \
+}
+
+#define	TIMESEL(sbt, sbt2)						\
+	(((sbt2) >= sbt_timethreshold) ?				\
+	    (getsbinuptime(sbt), 1) : (sbinuptime(sbt), 0))
+
 #else /* !_KERNEL */
 #include <time.h>
 
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c	(.../head)	(revision 246685)
+++ sys/netinet/tcp_timer.c	(.../projects/calloutng)	(revision 246685)
_at__at_ -719,20 +719,24 _at__at_
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
 void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+    struct xtcp_timer *xtimer)
 {
-	bzero(xtimer, sizeof(struct xtcp_timer));
+	sbintime_t now;
+
+	bzero(xtimer, sizeof(*xtimer));				XXX-lr use sbd2ms()
 	if (timer == NULL)
 		return;
+	getsbinuptime(&now);
 	if (callout_active(&timer->tt_delack))
-		xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
+		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_rexmt))
-		xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
+		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_persist))
-		xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
+		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_keep))
-		xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
+		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_2msl))
-		xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
Received on Sun Feb 17 2013 - 01:42:08 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:34 UTC