Re: Nasty non-recursive lockmgr panic on softdep only enabled UFS partition when filesystem full

From: Garrett Cooper <yanegomi_at_gmail.com>
Date: Thu, 5 May 2011 12:49:48 -0700
On Thu, May 5, 2011 at 10:36 AM, Kostik Belousov <kostikbel_at_gmail.com> wrote:
> On Thu, May 05, 2011 at 10:23:47AM -0700, Garrett Cooper wrote:
>> On May 4, 2011, at 2:07 AM, Kostik Belousov wrote:
>>
>> > On Tue, May 03, 2011 at 11:58:49PM -0700, Garrett Cooper wrote:
>> >> On Tue, May 3, 2011 at 11:42 PM, Garrett Cooper <yanegomi_at_gmail.com> wrote:
>> >>> On Tue, May 3, 2011 at 10:59 PM, Kirk McKusick <mckusick_at_mckusick.com> wrote:
>> >>>>> Date: Tue, 3 May 2011 22:40:26 -0700
>> >>>>> Subject: Nasty non-recursive lockmgr panic on softdep only enabled UFS
>> >>>>>  partition when filesystem full
>> >>>>> From: Garrett Cooper <yanegomi_at_gmail.com>
>> >>>>> To: Jeff Roberson <jeff_at_freebsd.org>,
>> >>>>>         Marshall Kirk McKusick <mckusick_at_mckusick.com>
>> >>>>> Cc: FreeBSD Current <freebsd-current_at_freebsd.org>
>> >>>>>
>> >>>>> Hi Jeff and Dr. McKusick,
>> >>>>>     Ran into this panic when /usr ran out of space doing a make
>> >>>>> universe on amd64/r221219 (it took ~15 minutes for the panic to occur
>> >>>>> after the filesystem ran out of space -- wasn't quite sure what it was
>> >>>>> doing at the time):
>> >>>>>
>> >>>>> ...
>> >>>>>
>> >>>>>     Let me know what other commands you would like for me to run in kgdb.
>> >>>>> Thanks,
>> >>>>> -Garrett
>> >>>>
>> >>>> You did not indicate whether you are running an 8.X system or a 9-current
>> >>>> system. It would be helpful to know that.
>> >>>
>> >>> I've actually been running CURRENT for a few years now, but you're right --
>> >>> I didn't mention that part.
>> >>>
>> >>>> Jeff thinks that there may be a potential race in the locking code for
>> >>>> softdep_request_cleanup. If so, this patch for 9-current should fix it:
>> >>>>
>> >>>> Index: ffs_softdep.c
>> >>>> ===================================================================
>> >>>> --- ffs_softdep.c       (revision 221385)
>> >>>> +++ ffs_softdep.c       (working copy)
>> >>>> _at__at_ -11380,7 +11380,8 _at__at_
>> >>>>                                continue;
>> >>>>                        }
>> >>>>                        MNT_IUNLOCK(mp);
>> >>>> -                       if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
>> >>>> +                       if (vget(lvp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
>> >>>> +                           curthread)) {
>> >>>>                                MNT_ILOCK(mp);
>> >>>>                                continue;
>> >>>>                        }
>> >>>>
>> >>>> If you are running an 8.X system, hopefully you will be able to apply it.
>> >>>
>> >>>    I've applied it, rebuilt and installed the kernel, and trying to
>> >>> repro the case again. Will let you know how things go!
>> >>
>> >>    Happened again with the change. It's really easy to repro:
>> >>
>> >> 1. Get a filesystem with UFS+SU
>> >> 2. Execute something that does a large number of small writes to a partition.
>> >> 3. 'dd if=/dev/zero of=FOO bs=10m' on the same partition
>> >>
>> >>    The kernel will panic with the issue I discussed above.
>> >> Thanks!
>> >
>> > Jeff' change is required to avoid LORs, but it is not sufficient to
>> > prevent recursion. We must skip the vnode supplied as a parameter to
>> > softdep_request_cleanup(). Theoretically, other vnodes might be also
>> > locked by curthread, thus I think the change below is needed. Try this.
>> >
>> > diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
>> > index a6d4441..25fa5d6 100644
>> > --- a/sys/ufs/ffs/ffs_softdep.c
>> > +++ b/sys/ufs/ffs/ffs_softdep.c
>> > _at__at_ -11380,7 +11380,9 _at__at_ retry:
>> >                             continue;
>> >                     }
>> >                     MNT_IUNLOCK(mp);
>> > -                   if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
>> > +                   if (VOP_ISLOCKED(lvp) ||
>> > +                       vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
>> > +                       curthread)) {
>> >                             MNT_ILOCK(mp);
>> >                             continue;
>> >                     }
>>
>>       Ran into the same panic after I applied the patch above with the repro steps I described before. One thing that I noticed is that the issue isn't as easy to reproduce unless you add the dd in parallel with the make operation.
>
> Well, I misread your original report. Also, there is another issue
> that is easily reproducable in similar situation. The latest patch
> is below.
>
> diff --git a/sys/sys/mount.h b/sys/sys/mount.h
> index 231e3d6..f064053 100644
> --- a/sys/sys/mount.h
> +++ b/sys/sys/mount.h
> _at__at_ -366,6 +366,8 _at__at_ void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
>  #define MNT_LAZY       3       /* push data not written by filesystem syncer */
>  #define MNT_SUSPEND    4       /* Suspend file system after sync */
>
> +#define        MNT_WAIT_ADV    0x10000000      /* MNT_WAIT prevent deadlock */
> +
>  /*
>  * Generic file handle
>  */
> diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
> index e60514d..87837cc 100644
> --- a/sys/ufs/ffs/ffs_alloc.c
> +++ b/sys/ufs/ffs/ffs_alloc.c
> _at__at_ -420,13 +420,13 _at__at_ nospace:
>         */
>        if (reclaimed == 0) {
>                reclaimed = 1;
> -               softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
> -               UFS_UNLOCK(ump);
>                if (bp) {
> +                       UFS_UNLOCK(ump);
>                        brelse(bp);
>                        bp = NULL;
> +                       UFS_LOCK(ump);
>                }
> -               UFS_LOCK(ump);
> +               softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
>                goto retry;
>        }
>        UFS_UNLOCK(ump);
> diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
> index d819c8a..d12e1dc 100644
> --- a/sys/ufs/ffs/ffs_extern.h
> +++ b/sys/ufs/ffs/ffs_extern.h
> _at__at_ -141,7 +141,7 _at__at_ void        softdep_setup_inofree(struct mount *, struct buf *, ino_t,
>  void   softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
>  void   *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
>  void   softdep_fsync_mountdev(struct vnode *);
> -int    softdep_sync_metadata(struct vnode *);
> +int    softdep_sync_metadata(struct vnode *, int flags);
>  int     softdep_process_worklist(struct mount *, int);
>  int     softdep_fsync(struct vnode *);
>  int    softdep_waitidle(struct mount *);
> diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
> index a6d4441..0b66e68 100644
> --- a/sys/ufs/ffs/ffs_softdep.c
> +++ b/sys/ufs/ffs/ffs_softdep.c
> _at__at_ -492,7 +492,7 _at__at_ softdep_flushworklist(oldmnt, countp, td)
>  }
>
>  int
> -softdep_sync_metadata(struct vnode *vp)
> +softdep_sync_metadata(struct vnode *vp, int flags)
>  {
>
>        return (0);
> _at__at_ -733,7 +733,7 _at__at_ static      void unlinked_inodedep(struct mount *, struct inodedep *);
>  static void clear_unlinked_inodedep(struct inodedep *);
>  static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
>  static int flush_pagedep_deps(struct vnode *, struct mount *,
> -           struct diraddhd *);
> +           struct diraddhd *, int);
>  static void free_pagedep(struct pagedep *);
>  static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
>  static int flush_inodedep_deps(struct mount *, ino_t);
> _at__at_ -10662,7 +10662,7 _at__at_ restart:
>  * associated with the file. If any I/O errors occur, they are returned.
>  */
>  int
> -softdep_sync_metadata(struct vnode *vp)
> +softdep_sync_metadata(struct vnode *vp, int flags)
>  {
>        struct pagedep *pagedep;
>        struct allocindir *aip;
> _at__at_ -10792,7 +10792,8 _at__at_ loop:
>                                        continue;
>                                if ((error =
>                                    flush_pagedep_deps(vp, wk->wk_mp,
> -                                               &pagedep->pd_diraddhd[i]))) {
> +                                               &pagedep->pd_diraddhd[i],
> +                                               flags))) {
>                                        FREE_LOCK(&lk);
>                                        goto loop_end;
>                                }
> _at__at_ -11056,10 +11057,11 _at__at_ flush_newblk_dep(vp, mp, lbn)
>  * Called with splbio blocked.
>  */
>  static int
> -flush_pagedep_deps(pvp, mp, diraddhdp)
> +flush_pagedep_deps(pvp, mp, diraddhdp, flags)
>        struct vnode *pvp;
>        struct mount *mp;
>        struct diraddhd *diraddhdp;
> +       int flags;
>  {
>        struct inodedep *inodedep;
>        struct inoref *inoref;
> _at__at_ -11069,8 +11071,13 _at__at_ flush_pagedep_deps(pvp, mp, diraddhdp)
>        int error = 0;
>        struct buf *bp;
>        ino_t inum;
> +       int lkflags;
>
>        ump = VFSTOUFS(mp);
> +       lkflags = LK_EXCLUSIVE;
> +       if ((flags & MNT_WAIT_ADV) != 0)
> +               lkflags |= LK_NOWAIT;
> +
>  restart:
>        while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
>                /*
> _at__at_ -11112,7 +11119,7 _at__at_ restart:
>                }
>                if (dap->da_state & MKDIR_BODY) {
>                        FREE_LOCK(&lk);
> -                       if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
> +                       if ((error = ffs_vgetf(mp, inum, lkflags, &vp,
>                            FFSV_FORCEINSMQ)))
>                                break;
>                        error = flush_newblk_dep(vp, mp, 0);
> _at__at_ -11176,7 +11183,7 _at__at_ retry:
>                 */
>                if (dap == LIST_FIRST(diraddhdp)) {
>                        FREE_LOCK(&lk);
> -                       if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
> +                       if ((error = ffs_vgetf(mp, inum, lkflags, &vp,
>                            FFSV_FORCEINSMQ)))
>                                break;
>                        error = ffs_update(vp, MNT_WAIT);
> _at__at_ -11379,17 +11386,17 _at__at_ retry:
>                                VI_UNLOCK(lvp);
>                                continue;
>                        }
> -                       MNT_IUNLOCK(mp);
> -                       if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
> -                               MNT_ILOCK(mp);
> +                       if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
> +                           curthread)) {
>                                continue;
>                        }
> +                       MNT_IUNLOCK(mp);
>                        if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
>                                vput(lvp);
>                                MNT_ILOCK(mp);
>                                continue;
>                        }
> -                       (void) ffs_syncvnode(lvp, MNT_WAIT);
> +                       (void) ffs_syncvnode(lvp, MNT_WAIT | MNT_WAIT_ADV);
>                        vput(lvp);
>                        MNT_ILOCK(mp);
>                }
> diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
> index cf6a5a8..c73e2a5 100644
> --- a/sys/ufs/ffs/ffs_vnops.c
> +++ b/sys/ufs/ffs/ffs_vnops.c
> _at__at_ -216,9 +216,11 _at__at_ ffs_syncvnode(struct vnode *vp, int waitfor)
>        struct bufobj *bo;
>        struct buf *bp;
>        struct buf *nbp;
> -       int s, error, wait, passes, skipmeta;
> +       int s, error, wait, passes, skipmeta, wait_adv;
>        ufs_lbn_t lbn;
>
> +       wait_adv = waitfor & MNT_WAIT_ADV;
> +       waitfor &= ~MNT_WAIT_ADV;
>        wait = (waitfor == MNT_WAIT);
>        lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
>        bo = &vp->v_bufobj;
> _at__at_ -328,7 +330,7 _at__at_ loop:
>                 * with the vnode has been written.
>                 */
>                splx(s);
> -               if ((error = softdep_sync_metadata(vp)) != 0)
> +               if ((error = softdep_sync_metadata(vp, wait_adv)) != 0)
>                        return (error);
>                s = splbio();

    Things look ok with that patch and the one that Jeff provided for
the LOR, taking into account your style change with the flag list.
Thanks!
-Garrett
Received on Thu May 05 2011 - 17:49:50 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:13 UTC