Re: fsck bug in replaying partial frag truncate journal on UFS SU+J?

From: <takehara.mikihito_at_jp.panasonic.com>
Date: Thu, 19 Jun 2014 11:34:47 +0900
Hello,

I kept analysis of the problem which I reported previously and found how to fix this problem.
(patch in my previou report is wrong.).

My understanding is there is a rule that blkno of JOP_FREEBLK or JOP_NEWBLK must be the first 
position in UFS block for each inode, for fsck_ffs's suj.c seems to support only such a case. But 
there is a case that kernel does create a JOP_FREEBLK journal which is not in this rule. The case is 
small file's partial frag truncation.

Bellow I attached a patch to fix this problem.
Adding new argument frags_offset for newfreework() to adjust for calling newjfreeblk(). New argument 
frags_offset is used by only softdep_journal_freeblocks()

diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4d0442b..2f2f063 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
_at__at_ -1012,7 +1012,7 _at__at_ static    void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
            ufs2_daddr_t, long, ufs_lbn_t);
 static struct freework *newfreework(struct ufsmount *, struct freeblks *,
-           struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
+           struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int, int);
 static int jwait(struct worklist *, int);
 static struct inodedep *inodedep_lookup_ip(struct inode *);
 static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
_at__at_ -3996,7 +3996,7 _at__at_ free_freedep(freedep)
  * is visible outside of softdep_setup_freeblocks().
  */
 static struct freework *
-newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
+newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal, frags_offset)
        struct ufsmount *ump;
        struct freeblks *freeblks;
        struct freework *parent;
_at__at_ -4005,6 +4005,7 _at__at_ newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
        int frags;
        int off;
        int journal;
+       int frags_offset;
 {
        struct freework *freework;
 
_at__at_ -4022,7 +4023,7 _at__at_ newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
                ? 0 : NINDIR(ump->um_fs) + 1;
        freework->fw_start = freework->fw_off = off;
        if (journal)
-               newjfreeblk(freeblks, lbn, nb, frags);
+               newjfreeblk(freeblks, lbn, nb - frags_offset, frags + frags_offset);
        if (parent == NULL) {
                ACQUIRE_LOCK(&lk);
                WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
_at__at_ -5958,7 +5959,7 _at__at_ setup_freedirect(freeblks, ip, i, needj)
        DIP_SET(ip, i_db[i], 0);
        frags = sblksize(ip->i_fs, ip->i_size, i);
        frags = numfrags(ip->i_fs, frags);
-       newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
+       newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj, 0);
 }
 
 static inline void
_at__at_ -5977,7 +5978,7 _at__at_ setup_freeext(freeblks, ip, i, needj)
        ip->i_din2->di_extb[i] = 0;
        frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
        frags = numfrags(ip->i_fs, frags);
-       newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
+       newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj, 0);
 }
 
 static inline void
_at__at_ -5995,7 +5996,7 _at__at_ setup_freeindir(freeblks, ip, i, lbn, needj)
                return;
        DIP_SET(ip, i_ib[i], 0);
        newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
-           0, needj);
+           0, needj, 0);
 }
 
 static inline struct freeblks *
_at__at_ -6111,7 +6112,7 _at__at_ setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
        if (off + 1 == NINDIR(ip->i_fs))
                goto nowork;
        freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
-           0);
+           0, 0);
        /*
         * Link the freework into the indirdep.  This will prevent any new
         * allocations from proceeding until we are finished with the
_at__at_ -6437,7 +6438,8 _at__at_ softdep_journal_freeblocks(ip, cred, length, flags)
                                oldfrags = numfrags(ip->i_fs, oldfrags);
                                blkno += numfrags(ip->i_fs, frags);
                                newfreework(ip->i_ump, freeblks, NULL, lastlbn,
-                                   blkno, oldfrags, 0, needj);
+                                   blkno, oldfrags, 0, needj,
+                                   numfrags(ip->i_fs, frags));
                        } else if (blkno == 0)
                                allocblock = 1;
                }
_at__at_ -7737,7 +7739,7 _at__at_ handle_workitem_freeblocks(freeblks, flags)
                                FREE_LOCK(&lk);
                                freework = newfreework(ump, freeblks, NULL,
                                    aip->ai_lbn, aip->ai_newblkno,
-                                   ump->um_fs->fs_frag, 0, 0);
+                                   ump->um_fs->fs_frag, 0, 0, 0);
                                ACQUIRE_LOCK(&lk);
                        }
                        newblk = WK_NEWBLK(wk);
_at__at_ -8014,7 +8016,7 _at__at_ indir_trunc(freework, dbn, lbn)
                        nlbn = (lbn + 1) - (i * lbnadd);
                        if (needj != 0) {
                                nfreework = newfreework(ump, freeblks, freework,
-                                   nlbn, nb, fs->fs_frag, 0, 0);
+                                   nlbn, nb, fs->fs_frag, 0, 0, 0);
                                freedeps++;
                        }
                        indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);

> Hello,
> 
> I think fsck_ffs has a bug in replaying partial frag truncate journal on UFS SU+J.
> Bellow I tested about the issue.
> 
> I tested blocksize==4KB, fragsize=512byte UFS SU+J. But I think these parameters are not related to 
> this issue.
> 1) Preparing 4096byte(1block) test file.
> 2) Use "truncate" to shorten filesize to 3584byte(7frags).
> 3) Shutdown without unmount after journal is written and before inode size ufs2_dinode is written.
> 4) Run fsck_ffs with journal.
> 5) Mount again and remove test file. Then I face panic.
>    "panic: ffs_blkfree_cg: freeing free block"
> 
> This seems to be caused by fsck_ffs to replay JOP_FREEBLK whose "blkno" is not block-aligned.
> The above case 2), JOP_FREEBLK journal is like this:
>    FREEBLK ino=5, blkno=1727, lbn=0, frags=1, oldfrags=0  <---(a)
> fsck_ffs handles JOP_NEWBLK almost same as JOP_FREEBLK. But my understanding is that in case of 
> JOP_NEWBLK kernel always create block-aligned blkno. So this issue is only in case of JOP_FREEBLK.
> 
> To analyze this issue, I also tested that using the above test file 3584byte(7frags) and write 
> 512byte with append to largen to 4096byte(1block). In this case JOP_NEWBLK journal is like this:
>    JOP_NEWBLK ino=5, blkno=1720, lbn=0, frags=8, oldfrags=7  <---(b)
> 
> I think the bug is in fsck_ffs suj.c's arround blk_check(). My understanding is that blk_check() 
> cannot handle non-block-aligned blkno so there is a little trick in blk_build() bellow.
> ----------------------------------------------------------------------------------------
>         /*                                                                              
>          * Rewrite the record using oldfrags to indicate the offset into                
>          * the block.  Leave jb_frags as the actual allocated count.                    
>          */
>         blkrec->jb_blkno -= frag;
>         blkrec->jb_oldfrags = frag;                                                     
> ----------------------------------------------------------------------------------------
> By this trick, (a) is modified like:
>    JOP_FREEBLK ino=5, blkno=1720, lbn=0, frags=1, oldfrags=7  <---(a')
> and (b) is modified like:
>    JOP_NEWBLK ino=5, blkno=1720, lbn=0, frags=8, oldfrags=0  <---(b')
> But blk_check() cannot handle the case "oldfrags!=0". If "oldfrags!=0", blk_check()'s "isat" comes 
> to be 0 even though the blk is present. So (a) should be modified same as (b')
> The following is my patch to fix like this. According to my test, this patch works fine.
> =======================================================================
> diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c
> index e21ffc6..2512522 100644
> --- a/sbin/fsck_ffs/suj.c
> +++ b/sbin/fsck_ffs/suj.c
> _at__at_ -2503,11 +2503,11 _at__at_ blk_build(struct jblkrec *blkrec)
>         frag = fragnum(fs, blkrec->jb_blkno);
>         sblk = blk_lookup(blk, 1);
>         /*
> -        * Rewrite the record using oldfrags to indicate the offset into
> -        * the block.  Leave jb_frags as the actual allocated count.
> +        * Rewrite the record to indicate the offset into the block.
>          */
>         blkrec->jb_blkno -= frag;
> -       blkrec->jb_oldfrags = frag;
> +       blkrec->jb_frags += frag;
> +       blkrec->jb_oldfrags = 0;
>         if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag)
>                 err_suj("Invalid fragment count %d oldfrags %d\n",
>                     blkrec->jb_frags, frag);
> =======================================================================
> 
> I think we can fix this by changing kernel code not to create non-block-aligned JOP_FREEBLK.
> But I also think it is better to change fsck by considering compatibility.
> 
> 
> _______________________________________________
> freebsd-current_at_freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-current
> To unsubscribe, send any mail to "freebsd-current-unsubscribe_at_freebsd.org"
Received on Thu Jun 19 2014 - 00:55:13 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:50 UTC