Re: Signals and an exiting thread

From: Kostik Belousov <kostikbel_at_gmail.com>
Date: Mon, 5 Oct 2009 22:21:44 +0300
On Fri, Oct 02, 2009 at 10:12:13PM +0200, Jilles Tjoelker wrote:
> On Thu, Oct 01, 2009 at 03:07:30PM +0300, Kostik Belousov wrote:
> > On Wed, Sep 30, 2009 at 11:02:19AM -0700, Justin Teller wrote:
> > > We're trying to control one process from another process through
> > > signals (I know, revolutionary ;-), and we've found that a signal
> > > occasionally gets lost. šThe process we're signaling is
> > > multi-threaded. šIt looks like the signal is lost when the kernel
> > > decides to post the signal to a thread that is in the process of dying
> > > (calling pthread_exit, etc).
> 
> > > Is this expected behavior that we should just handle, or is it a race
> > > in the kernel that should be/will be/already is fixed?
> 
> > > It may be that a fix is already in current, and I just haven't found
> > > it in my searches through the source code (I'm working off of source
> > > code for an older 8.0 image).  If it is fixed, I'd appreciate a
> > > pointer to the code that fixes it.
> 
> > When thread enters the kernel last time to be killed, it is very much
> > bad idea to allow it to return to usermode to handle directed signal.
> > And, there would always be window between entering the kernel and
> > marking the thread as exiting.
> 
> > Moving the thread-directed signals back to the process queue is hard
> > and there is no way to distinguish which signals were sent to process
> > or to the thread.
> 
> > Possibly, we could clear the thread signal mask while still in user mode.
> > I think it would still leave a very narrow window when a process
> > signal could be directed to the dying thread and not be delivered to
> > usermode; this happens when signal is generated while sigsetmask already
> > entered the kernel, but did not changed the mask yet. This is worked
> > around by rechecking the pending signals after setting the block mask
> > and releasing it if needed.
> 
> SIGKILL cannot be masked. Is it possible that a kill(SIGKILL) is
> assigned to a dying thread and lost?
> 
> (SIGSTOP cannot be masked either, but its processing is done by the
> sending thread, so it should be safe.)
> 
> I suppose that race can also occur in other uses of pthread_sigmask().
> If a thread masks a signal for a while, and that signal is assigned to
> that thread just as it is executing pthread_sigmask(), it will only be
> processed when that thread unblocks or accepts it, even though other
> threads may have the signal unmasked or be in a sigwait() for it.
> Signals sent after pthread_sigmask() has changed the signal mask are
> likely processed sooner because they will be assigned to a different
> thread or left in the process queue.
> 
> POSIX seems to say that signals generated for the process should remain
> queued for the process and should only be assigned to a thread at time
> of delivery.
> 
> This could be implemented by leaving signals in the process queue or by
> tracking for each signal in the thread queue whether it was directed at
> the thread and moving the process signals back at sigmask/thr_exit.
> Either way I am not sure of all the consequences at this time.

I agree that postponing assignment of the thread for signal delivery
till the actual delivery occurs is the proper fix. I tried to cheat
in my previous patch. Below is an experimental change that did very
minimal testing.

diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 0386fc4..abd9714 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
_at__at_ -581,20 +581,11 _at__at_ void
 signotify(struct thread *td)
 {
 	struct proc *p;
-	sigset_t set;
 
 	p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
-	/*
-	 * If our mask changed we may have to move signal that were
-	 * previously masked by all threads to our sigqueue.
-	 */
-	set = p->p_sigqueue.sq_signals;
-	SIGSETNAND(set, td->td_sigmask);
-	if (! SIGISEMPTY(set))
-		sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
 	if (SIGPENDING(td)) {
 		thread_lock(td);
 		td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
_at__at_ -1985,17 +1976,9 _at__at_ tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
-	/*
-	 * If the signal is blocked and not destined for this thread, then
-	 * assign it to the process so that we can find it later in the first
-	 * thread that unblocks it.  Otherwise, assign it to this thread now.
-	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
-		if (SIGISMEMBER(td->td_sigmask, sig))
-			sigqueue = &p->p_sigqueue;
-		else
-			sigqueue = &td->td_sigqueue;
+		sigqueue = &p->p_sigqueue;
 	} else {
 		KASSERT(td->td_proc == p, ("invalid thread"));
 		sigqueue = &td->td_sigqueue;
_at__at_ -2409,8 +2392,9 _at__at_ issignal(struct thread *td, int stop_allowed)
 {
 	struct proc *p;
 	struct sigacts *ps;
+	struct sigqueue *queue;
 	sigset_t sigpending;
-	int sig, prop, newsig;
+	int sig, prop, newsig, signo;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
_at__at_ -2420,6 +2404,7 _at__at_ issignal(struct thread *td, int stop_allowed)
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		sigpending = td->td_sigqueue.sq_signals;
+		SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		if (p->p_flag & P_PPWAIT)
_at__at_ -2440,6 +2425,7 _at__at_ issignal(struct thread *td, int stop_allowed)
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
+			sigqueue_delete(&p->p_sigqueue, sig);
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
_at__at_ -2452,12 +2438,18 _at__at_ issignal(struct thread *td, int stop_allowed)
 
 			if (sig != newsig) {
 				ksiginfo_t ksi;
+
+				queue = &td->td_sigqueue;
 				/*
 				 * clear old signal.
 				 * XXX shrug off debugger, it causes siginfo to
 				 * be thrown away.
 				 */
-				sigqueue_get(&td->td_sigqueue, sig, &ksi);
+				if (sigqueue_get(queue, sig, &ksi) == 0) {
+					queue = &p->p_sigqueue;
+					signo = sigqueue_get(queue, sig, &ksi);
+					KASSERT(signo == sig, ("signo != sig"));
+				}
 
 				/*
 				 * If parent wants us to take the signal,
_at__at_ -2472,7 +2464,7 _at__at_ issignal(struct thread *td, int stop_allowed)
 				 * Put the new signal into td_sigqueue. If the
 				 * signal is being masked, look for other signals.
 				 */
-				SIGADDSET(td->td_sigqueue.sq_signals, sig);
+				SIGADDSET(queue->sq_signals, sig);
 				if (SIGISMEMBER(td->td_sigmask, sig))
 					continue;
 				signotify(td);
_at__at_ -2567,6 +2559,7 _at__at_ issignal(struct thread *td, int stop_allowed)
 			return (sig);
 		}
 		sigqueue_delete(&td->td_sigqueue, sig);		/* take the signal! */
+		sigqueue_delete(&p->p_sigqueue, sig);
 	}
 	/* NOTREACHED */
 }
_at__at_ -2613,7 +2606,9 _at__at_ postsig(sig)
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
-	sigqueue_get(&td->td_sigqueue, sig, &ksi);
+	if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
+	    sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
+		return;
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 6d60ddb..ccf6479 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
_at__at_ -90,6 +90,7 _at__at_ userret(struct thread *td, struct trapframe *frame)
 
 	CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
             td->td_name);
+#if 0
 #ifdef DIAGNOSTIC
 	/* Check that we called signotify() enough. */
 	PROC_LOCK(p);
_at__at_ -100,6 +101,7 _at__at_ userret(struct thread *td, struct trapframe *frame)
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 #endif
+#endif
 #ifdef KTRACE
 	KTRUSERRET(td);
 #endif
_at__at_ -218,7 +220,14 _at__at_ ast(struct trapframe *framep)
 			ktrcsw(0, 1);
 #endif
 	}
-	if (flags & TDF_NEEDSIGCHK) {
+
+	/*
+	 * Check for signals. Unlocked reads of p_pendingcnt or
+	 * p_siglist might cause process-directed signal to be handled
+	 * later.
+	 */
+	if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
+	    !SIGISEMPTY(p->p_siglist)) {
 		PROC_LOCK(p);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		while ((sig = cursig(td, SIG_STOP_ALLOWED)) != 0)
diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h
index 89b40f0..680da40 100644
--- a/sys/sys/signalvar.h
+++ b/sys/sys/signalvar.h
_at__at_ -252,9 +252,10 _at__at_ typedef struct sigqueue {
 
 /* Return nonzero if process p has an unmasked pending signal. */
 #define	SIGPENDING(td)							\
-	(!SIGISEMPTY((td)->td_siglist) &&				\
-	    !sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask))
-
+	((!SIGISEMPTY((td)->td_siglist) &&				\
+	    !sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask)) ||	\
+	 (!SIGISEMPTY((td)->td_proc->p_siglist) &&			\
+	    !sigsetmasked(&(td)->td_proc->p_siglist, &(td)->td_sigmask)))
 /*
  * Return the value of the pseudo-expression ((*set & ~*mask) != 0).  This
  * is an optimized version of SIGISEMPTY() on a temporary variable
diff --git a/tools/regression/sigqueue/sigqtest1/sigqtest1.c b/tools/regression/sigqueue/sigqtest1/sigqtest1.c
index 0f40021..f0201c3 100644
--- a/tools/regression/sigqueue/sigqtest1/sigqtest1.c
+++ b/tools/regression/sigqueue/sigqtest1/sigqtest1.c
_at__at_ -1,12 +1,14 _at__at_
 /* $FreeBSD$ */
-#include <signal.h>
-#include <stdio.h>
 #include <err.h>
 #include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
 
 int received;
 
-void handler(int sig, siginfo_t *si, void *ctx)
+void
+handler(int sig, siginfo_t *si, void *ctx)
 {
 	if (si->si_code != SI_QUEUE)
 		errx(1, "si_code != SI_QUEUE");
_at__at_ -15,7 +17,8 _at__at_ void handler(int sig, siginfo_t *si, void *ctx)
 	received++;
 }
 
-int main()
+int
+main()
 {
 	struct sigaction sa;
 	union sigval val;
diff --git a/tools/regression/sigqueue/sigqtest2/sigqtest2.c b/tools/regression/sigqueue/sigqtest2/sigqtest2.c
index 078ea81..50b579d 100644
--- a/tools/regression/sigqueue/sigqtest2/sigqtest2.c
+++ b/tools/regression/sigqueue/sigqtest2/sigqtest2.c
_at__at_ -1,24 +1,29 _at__at_
 /* $FreeBSD$ */
-#include <signal.h>
-#include <stdio.h>
-#include <err.h>
-#include <errno.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
 
 int stop_received;
 int exit_received;
 int cont_received;
 
-void job_handler(int sig, siginfo_t *si, void *ctx)
+void
+job_handler(int sig, siginfo_t *si, void *ctx)
 {
 	int status;
 	int ret;
 
 	if (si->si_code == CLD_STOPPED) {
+		printf("%d: stop received\n", si->si_pid);
 		stop_received = 1;
 		kill(si->si_pid, SIGCONT);
 	} else if (si->si_code == CLD_EXITED) {
+		printf("%d: exit received\n", si->si_pid);
 		ret = waitpid(si->si_pid, &status, 0);
 		if (ret == -1)
 			errx(1, "waitpid");
_at__at_ -26,11 +31,13 _at__at_ void job_handler(int sig, siginfo_t *si, void *ctx)
 			errx(1, "!WIFEXITED(status)");
 		exit_received = 1;
 	} else if (si->si_code == CLD_CONTINUED) {
+		printf("%d: cont received\n", si->si_pid);
 		cont_received = 1;
 	}
 }
 
-void job_control_test()
+void
+job_control_test(void)
 {
 	struct sigaction sa;
 	pid_t pid;
_at__at_ -43,9 +50,12 _at__at_ void job_control_test()
 	stop_received = 0;
 	cont_received = 0;
 	exit_received = 0;
+	fflush(stdout);
 	pid = fork();
 	if (pid == 0) {
+		printf("child %d\n", getpid());
 		kill(getpid(), SIGSTOP);
+		sleep(2);
 		exit(1);
 	}
 
_at__at_ -60,11 +70,13 _at__at_ void job_control_test()
 	printf("job control test OK.\n");
 }
 
-void rtsig_handler(int sig, siginfo_t *si, void *ctx)
+void
+rtsig_handler(int sig, siginfo_t *si, void *ctx)
 {
 }
 
-int main()
+int
+main()
 {
 	struct sigaction sa;
 	sigset_t set;

Received on Mon Oct 05 2009 - 17:21:51 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:39:56 UTC