Index: sys/kern/kern_fork.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_fork.c,v retrieving revision 1.198 diff -u -r1.198 kern_fork.c --- sys/kern/kern_fork.c 13 May 2003 20:35:59 -0000 1.198 +++ sys/kern/kern_fork.c 18 May 2003 15:01:14 -0000 @@ -139,13 +139,6 @@ /* Don't allow kernel only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); - /* - * Don't allow sharing of file descriptor table unless - * RFTHREAD flag is supplied - */ - if ((uap->flags & (RFPROC | RFTHREAD | RFFDG | RFCFDG)) == - RFPROC) - return(EINVAL); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; @@ -209,6 +202,7 @@ int ok; static int pidchecked = 0; struct filedesc *fd; + struct filedesc_to_leader *fdtol; struct proc *p1 = td->td_proc; struct thread *td2; struct kse *ke2; @@ -419,15 +413,40 @@ /* * Copy filedesc. */ - if (flags & RFCFDG) + if (flags & RFCFDG) { fd = fdinit(td->td_proc->p_fd); - else if (flags & RFFDG) { + fdtol = NULL; + } else if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); - } else + fdtol = NULL; + } else { fd = fdshare(p1->p_fd); - + if (p1->p_fdtol == NULL) + p1->p_fdtol = + filedesc_to_leader_alloc(NULL, + NULL, + p1->p_leader); + if ((flags & RFTHREAD) != 0) { + /* + * Shared file descriptor table and + * shared process leaders. + */ + fdtol = p1->p_fdtol; + FILEDESC_LOCK(p1->p_fd); + fdtol->fdl_refcount++; + FILEDESC_UNLOCK(p1->p_fd); + } else { + /* + * Shared file descriptor table, and + * different process leaders + */ + fdtol = filedesc_to_leader_alloc(p1->p_fdtol, + p1->p_fd, + p2); + } + } /* * Make a proc table entry for the new process. 
* Start by zeroing the section of proc that is zero-initialized, @@ -506,6 +525,7 @@ if (p2->p_textvp) VREF(p2->p_textvp); p2->p_fd = fd; + p2->p_fdtol = fdtol; PROC_UNLOCK(p1); PROC_UNLOCK(p2); Index: sys/kern/kern_descrip.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.196 diff -u -r1.196 kern_descrip.c --- sys/kern/kern_descrip.c 15 May 2003 21:13:08 -0000 1.196 +++ sys/kern/kern_descrip.c 20 May 2003 13:42:01 -0000 @@ -73,6 +73,8 @@ #include static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); +static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader", + "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; @@ -456,6 +458,7 @@ struct file *fp; struct file *delfp; int error, newfd; + int holdleaders; p = td->td_proc; fdp = p->p_fd; @@ -520,6 +523,15 @@ * introducing an ownership race for the slot. */ delfp = fdp->fd_ofiles[new]; + if (delfp != NULL && p->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). 
+ */ + fdp->fd_holdleaderscount++; + holdleaders = 1; + } else + holdleaders = 0; KASSERT(delfp == NULL || type == DUP_FIXED, ("dup() picked an open file")); #if 0 @@ -546,6 +558,16 @@ mtx_lock(&Giant); (void) closef(delfp, td); mtx_unlock(&Giant); + if (holdleaders) { + FILEDESC_LOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_UNLOCK(fdp); + } } return (0); } @@ -793,9 +815,11 @@ struct filedesc *fdp; struct file *fp; int fd, error; + int holdleaders; fd = uap->fd; error = 0; + holdleaders = 0; fdp = td->td_proc->p_fd; mtx_lock(&Giant); FILEDESC_LOCK(fdp); @@ -811,6 +835,14 @@ #endif fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; + if (td->td_proc->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). + */ + fdp->fd_holdleaderscount++; + holdleaders = 1; + } /* * we now hold the fp reference that used to be owned by the descriptor @@ -829,6 +861,16 @@ error = closef(fp, td); done2: mtx_unlock(&Giant); + if (holdleaders) { + FILEDESC_LOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_UNLOCK(fdp); + } return (error); } @@ -1383,12 +1425,88 @@ struct filedesc *fdp; struct file **fpp; int i; + struct filedesc_to_leader *fdtol; + struct file *fp; + struct vnode *vp; + struct flock lf; /* Certain daemons might not have file descriptors. 
*/ fdp = td->td_proc->p_fd; if (fdp == NULL) return; + /* Check for special need to clear POSIX style locks */ + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + FILEDESC_LOCK(fdp); + KASSERT(fdtol->fdl_refcount > 0, + ("filedesc_to_refcount botch: fdl_refcount=%d", + fdtol->fdl_refcount)); + if (fdtol->fdl_refcount == 1 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + i = 0; + fpp = fdp->fd_ofiles; + for (i = 0, fpp = fdp->fd_ofiles; + i < fdp->fd_lastfile; + i++, fpp++) { + if (*fpp == NULL || + (*fpp)->f_type != DTYPE_VNODE) + continue; + fp = *fpp; + fhold(fp); + FILEDESC_UNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_data; + (void) VOP_ADVLOCK(vp, + (caddr_t)td->td_proc-> + p_leader, + F_UNLCK, + &lf, + F_POSIX); + FILEDESC_LOCK(fdp); + fdrop(fp, td); + fpp = fdp->fd_ofiles + i; + } + } + retry: + if (fdtol->fdl_refcount == 1) { + if (fdp->fd_holdleaderscount > 0 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + /* + * close() or do_dup() has cleared a reference + * in a shared file descriptor table. + */ + fdp->fd_holdleaderswakeup = 1; + msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx, + PLOCK, "fdlhold", 0); + goto retry; + } + if (fdtol->fdl_holdcount > 0) { + /* + * Ensure that fdtol->fdl_leader + * remains valid in closef(). 
+ */ + fdtol->fdl_wakeup = 1; + msleep(fdtol, &fdp->fd_mtx, + PLOCK, "fdlhold", 0); + goto retry; + } + } + fdtol->fdl_refcount--; + if (fdtol->fdl_refcount == 0 && + fdtol->fdl_holdcount == 0) { + fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; + fdtol->fdl_prev->fdl_next = fdtol->fdl_next; + } else + fdtol = NULL; + td->td_proc->p_fdtol = NULL; + FILEDESC_UNLOCK(fdp); + if (fdtol != NULL) + FREE(fdtol, M_FILEDESC_TO_LEADER); + } FILEDESC_LOCK(fdp); if (--fdp->fd_refcnt > 0) { FILEDESC_UNLOCK(fdp); @@ -1626,6 +1744,8 @@ { struct vnode *vp; struct flock lf; + struct filedesc_to_leader *fdtol; + struct filedesc *fdp; if (fp == NULL) return (0); @@ -1637,15 +1757,51 @@ * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor. */ - if (td != NULL && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0 && + if (td != NULL && fp->f_type == DTYPE_VNODE) { - lf.l_whence = SEEK_SET; - lf.l_start = 0; - lf.l_len = 0; - lf.l_type = F_UNLCK; - vp = fp->f_data; - (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, - F_UNLCK, &lf, F_POSIX); + if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + /* + * Handle special case where file descriptor table + * is shared between multiple process leaders. 
+ */ + fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); + for (fdtol = fdtol->fdl_next; + fdtol != td->td_proc->p_fdtol; + fdtol = fdtol->fdl_next) { + if ((fdtol->fdl_leader->p_flag & + P_ADVLOCK) == 0) + continue; + fdtol->fdl_holdcount++; + FILEDESC_UNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_data; + (void) VOP_ADVLOCK(vp, + (caddr_t)fdtol->fdl_leader, + F_UNLCK, &lf, F_POSIX); + FILEDESC_LOCK(fdp); + fdtol->fdl_holdcount--; + if (fdtol->fdl_holdcount == 0 && + fdtol->fdl_wakeup != 0) { + fdtol->fdl_wakeup = 0; + wakeup(fdtol); + } + } + FILEDESC_UNLOCK(fdp); + } } return (fdrop(fp, td)); } @@ -2078,6 +2234,36 @@ return (error); } /* NOTREACHED */ +} + + +struct filedesc_to_leader * +filedesc_to_leader_alloc(struct filedesc_to_leader *old, + struct filedesc *fdp, + struct proc *leader) +{ + struct filedesc_to_leader *fdtol; + + MALLOC(fdtol, struct filedesc_to_leader *, + sizeof(struct filedesc_to_leader), + M_FILEDESC_TO_LEADER, + M_WAITOK); + fdtol->fdl_refcount = 1; + fdtol->fdl_holdcount = 0; + fdtol->fdl_wakeup = 0; + fdtol->fdl_leader = leader; + if (old != NULL) { + FILEDESC_LOCK(fdp); + fdtol->fdl_next = old->fdl_next; + fdtol->fdl_prev = old; + old->fdl_next = fdtol; + fdtol->fdl_next->fdl_prev = fdtol; + FILEDESC_UNLOCK(fdp); + } else { + fdtol->fdl_next = fdtol; + fdtol->fdl_prev = fdtol; + } + return fdtol; } /* Index: sys/kern/init_main.c =================================================================== RCS file: /home/ncvs/src/sys/kern/init_main.c,v retrieving revision 1.231 diff -u -r1.231 init_main.c --- sys/kern/init_main.c 13 May 2003 20:35:59 -0000 1.231 +++ sys/kern/init_main.c 16 May 2003 01:49:21 -0000 @@ -407,6 +407,7 @@ /* Create the file descriptor table. 
*/ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; + p->p_fdtol = NULL; mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; Index: sys/sys/proc.h =================================================================== RCS file: /home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.329 diff -u -r1.329 proc.h --- sys/sys/proc.h 16 May 2003 21:26:42 -0000 1.329 +++ sys/sys/proc.h 17 May 2003 13:47:00 -0000 @@ -510,6 +510,7 @@ TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ + struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */ /* Accumulated stats for all KSEs? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c*) Process limits. */ Index: sys/sys/filedesc.h =================================================================== RCS file: /home/ncvs/src/sys/sys/filedesc.h,v retrieving revision 1.50 diff -u -r1.50 filedesc.h --- sys/sys/filedesc.h 15 Feb 2003 05:52:56 -0000 1.50 +++ sys/sys/filedesc.h 19 May 2003 20:44:41 -0000 @@ -75,6 +75,8 @@ u_long fd_knhashmask; /* size of knhash */ struct klist *fd_knhash; /* hash table for attached knotes */ struct mtx fd_mtx; /* mtx to protect the members of struct filedesc */ + int fd_holdleaderscount; /* block fdfree() for shared close() */ + int fd_holdleaderswakeup; /* fdfree() needs wakeup */ }; /* @@ -91,6 +93,27 @@ char fd_dfileflags[NDFILE]; }; + + +/* + * Structure to keep track of (process leader, struct filedesc) tuples. + * Each process has a pointer to such a structure when detailed tracking + * is needed, e.g. when rfork(RFPROC | RFMEM) causes a file descriptor + * table to be shared by processes having different "p_leader" pointers + * and thus distinct POSIX style locks. + * + * fdl_refcount and fdl_holdcount are protected by struct filedesc mtx. 
+ */ +struct filedesc_to_leader { + int fdl_refcount; /* references from struct proc */ + int fdl_holdcount; /* temporary hold during closef */ + int fdl_wakeup; /* fdfree() waits on closef() */ + struct proc *fdl_leader; /* owner of POSIX locks */ + /* Circular list */ + struct filedesc_to_leader *fdl_prev; + struct filedesc_to_leader *fdl_next; +}; + /* * Per-process open flags. */ @@ -130,6 +153,12 @@ static __inline struct file * fget_locked(struct filedesc *fdp, int fd); int getvnode(struct filedesc *fdp, int fd, struct file **fpp); void setugidsafety(struct thread *td); + +struct filedesc_to_leader * +filedesc_to_leader_alloc(struct filedesc_to_leader *old, + struct filedesc *fdp, + struct proc *leader); + static __inline struct file * fget_locked(struct filedesc *fdp, int fd)