Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	(working copy)
@@ -1041,7 +1041,7 @@
 	if ((itx != NULL) &&
 	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
 		(void) taskq_dispatch(zilog->zl_clean_taskq,
-		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+		    (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(working copy)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,7 +45,6 @@
 uint64_t zfs_write_limit_max = 0;	/* max data payload per txg */
 uint64_t zfs_write_limit_inflated = 0;
 uint64_t zfs_write_limit_override = 0;
-extern uint64_t zfs_write_limit_min;
 
 kmutex_t zfs_write_limit_lock;
 
@@ -91,6 +90,9 @@
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
 
+	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
+	    1, 4, 0);
+
 	return (dp);
 }
 
@@ -130,14 +132,15 @@
 			goto out;
 		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 		    FTAG, &ds);
+		if (err == 0) {
+			err = dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj, dp,
+			    &dp->dp_origin_snap);
+			dsl_dataset_rele(ds, FTAG);
+		}
+		dsl_dir_close(dd, dp);
 		if (err)
 			goto out;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    dp, &dp->dp_origin_snap);
-		if (err)
-			goto out;
-		dsl_dataset_rele(ds, FTAG);
-		dsl_dir_close(dd, dp);
 	}
 
 	/* get scrub status */
@@ -220,7 +223,6 @@
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_dirs);
-	txg_list_destroy(&dp->dp_sync_tasks);
 	list_destroy(&dp->dp_synced_datasets);
 
 	arc_flush(dp->dp_spa);
@@ -228,6 +230,9 @@
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	mutex_destroy(&dp->dp_scrub_cancel_lock);
+	taskq_destroy(dp->dp_vnrele_taskq);
+	if (dp->dp_blkstats)
+		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
@@ -296,24 +301,57 @@
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	dp->dp_read_overhead = 0;
+	start = gethrtime();
+
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
-		if (!list_link_active(&ds->ds_synced_link))
-			list_insert_tail(&dp->dp_synced_datasets, ds);
-		else
-			dmu_buf_rele(ds->ds_dbuf, ds);
+		/*
+		 * We must not sync any non-MOS datasets twice, because
+		 * we may have taken a snapshot of them.  However, we
+		 * may sync newly-created datasets on pass 2.
+		 */
+		ASSERT(!list_link_active(&ds->ds_synced_link));
+		list_insert_tail(&dp->dp_synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	DTRACE_PROBE(pool_sync__1setup);
+	err = zio_wait(zio);
 
-	start = gethrtime();
-	err = zio_wait(zio);
 	write_time = gethrtime() - start;
 	ASSERT(err == 0);
 	DTRACE_PROBE(pool_sync__2rootzio);
 
-	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+#ifdef notyet
+	/*
+	 * XXX requires newer version of dmu_objset.c
+	 */
+	for (ds = list_head(&dp->dp_synced_datasets); ds;
+	    ds = list_next(&dp->dp_synced_datasets, ds))
+		dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+#endif
+
+	/*
+	 * Sync the datasets again to push out the changes due to
+	 * userquota updates.  This must be done before we process the
+	 * sync tasks, because that could cause a snapshot of a dataset
+	 * whose ds_bp will be rewritten when we do this 2nd sync.
+	 */
+	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+		ASSERT(list_link_active(&ds->ds_synced_link));
+		dmu_buf_rele(ds->ds_dbuf, ds);
+		dsl_dataset_sync(ds, zio, tx);
+	}
+	err = zio_wait(zio);
+
+	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
+		/*
+		 * No more sync tasks should have been added while we
+		 * were syncing.
+		 */
+		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 		dsl_sync_task_group_sync(dstg, tx);
+	}
 	DTRACE_PROBE(pool_sync__3task);
 
 	start = gethrtime();
@@ -611,3 +649,9 @@
 	dsl_dataset_rele(ds, FTAG);
 	rw_exit(&dp->dp_config_rwlock);
 }
+
+taskq_t *
+dsl_pool_vnrele_taskq(dsl_pool_t *dp)
+{
+	return (dp->dp_vnrele_taskq);
+}
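
[Note, not part of the patch: dsl_pool_create() now also creates a single-threaded "zfs_vn_rele_taskq", torn down in dsl_pool_close(), and dsl_pool_vnrele_taskq() exposes it to callers. The sketch below shows how code running in syncing context can push the final release of a vnode onto that taskq instead of doing it inline; the helper and callback names are hypothetical, but taskq_dispatch() is the same Solaris KPI used in the zil.c hunk above.]

	/* Hypothetical consumer: defer the last VN_RELE() of "vp". */
	static void
	vn_rele_cb(void *arg)
	{
		vnode_t *vp = arg;

		VN_RELE(vp);	/* taskq context: no tx held, safe to inactivate */
	}

	static void
	defer_vn_rele(dsl_pool_t *dp, vnode_t *vp)
	{
		(void) taskq_dispatch(dsl_pool_vnrele_taskq(dp),
		    vn_rele_cb, vp, TQ_SLEEP);
	}
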
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(working copy)
@@ -93,6 +93,7 @@
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
+ *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
@@ -924,11 +925,16 @@
 	rl_t *rl = zgd->zgd_rl;
 	vnode_t *vp = ZTOV(rl->r_zp);
 	int vfslocked;
+	objset_t *os = rl->r_zp->z_zfsvfs->z_os;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
 	dmu_buf_rele(db, vzgd);
 	zfs_range_unlock(rl);
-	VN_RELE(vp);
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 	kmem_free(zgd, sizeof (zgd_t));
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -959,7 +965,12 @@
 	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
 		return (ENOENT);
 	if (zp->z_unlinked) {
-		VN_RELE(ZTOV(zp));
+		/*
+		 * Release the vnode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 		return (ENOENT);
 	}
 
@@ -1031,7 +1042,11 @@
 	}
 out:
 	zfs_range_unlock(rl);
-	VN_RELE(ZTOV(zp));
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 	return (error);
 }
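
[Note, not part of the patch: all three converted call sites run while the ZIL commit path has the current txg held open. Per the locking rules added at the top of this file, a synchronous VN_RELE() that drops the last use count can reach zfs_zinactive(), which may need a new tx. A sketch of the deadlock being avoided (hypothetical call chain, simplified):]

	/*
	 * zfs_get_data()                     txg N held open for the ZIL
	 *   VN_RELE(vp)                      drops the last use count
	 *     VOP_INACTIVE() -> zfs_inactive() -> zfs_zinactive()
	 *       dmu_tx_assign(tx, TXG_WAIT)  waits for txg N to sync,
	 *                                    but txg N cannot sync until
	 *                                    zfs_get_data() returns.
	 */
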
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	(working copy)
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
 
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef	__cplusplus
 extern "C" {
@@ -48,7 +49,26 @@
 	SCRUB_FUNC_NUMFUNCS
 };
 
+/* These macros are for indexing into the zfs_all_blkstats_t. */
+#define	DMU_OT_DEFERRED	DMU_OT_NONE
+#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
+
+typedef struct zfs_blkstat {
+	uint64_t	zb_count;
+	uint64_t	zb_asize;
+	uint64_t	zb_lsize;
+	uint64_t	zb_psize;
+	uint64_t	zb_gangs;
+	uint64_t	zb_ditto_2_of_2_samevdev;
+	uint64_t	zb_ditto_2_of_3_samevdev;
+	uint64_t	zb_ditto_3_of_3_samevdev;
+} zfs_blkstat_t;
+
+typedef struct zfs_all_blkstats {
+	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+} zfs_all_blkstats_t;
+
+
 typedef struct dsl_pool {
 	/* Immutable */
 	spa_t *dp_spa;
@@ -57,6 +77,7 @@
 	struct dsl_dir *dp_mos_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
+	struct taskq *dp_vnrele_taskq;
 
 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
@@ -94,6 +115,8 @@
 	 * nobody else could possibly have it for write.
 	 */
 	krwlock_t dp_config_rwlock;
+
+	zfs_all_blkstats_t *dp_blkstats;
 } dsl_pool_t;
 
 int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
@@ -111,6 +134,8 @@
 	zio_done_func_t *done, void *private, uint32_t arc_flags);
 void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 
@@ -119,6 +144,8 @@
 void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_scrub_restart(dsl_pool_t *dp);
 
+taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+
 #ifdef	__cplusplus
 }
 #endif
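
[Note, not part of the patch: the zab_type table added above has one row per indirection level plus a roll-up row, and one column per DMU object type plus the two aliased slots, DMU_OT_DEFERRED (reusing DMU_OT_NONE, slot 0) and DMU_OT_TOTAL (reusing DMU_OT_NUMTYPES) for totals. The added #include is presumably <sys/dnode.h>, which defines DN_MAX_LEVELS; the bracketed file names were lost in this copy of the patch. A hypothetical accessor, just to make the indexing concrete:]

	static zfs_blkstat_t *
	zab_slot(zfs_all_blkstats_t *zab, int level, dmu_object_type_t type)
	{
		/* Row DN_MAX_LEVELS and column DMU_OT_TOTAL hold roll-ups. */
		if (level > DN_MAX_LEVELS)
			level = DN_MAX_LEVELS;
		return (&zab->zab_type[level][type]);
	}
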
Index: sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c	(working copy)
@@ -41,8 +41,37 @@
 #include
 #include
+#include
 #include
+#include
 
+static void	freebsd_vinactive(struct vnode *, struct thread *);
+
+/*
+ * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
+ * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
+ * OWEINACT tracks whether a vnode missed a call to inactive due to a
+ * failed lock upgrade.
+ */
+static void
+freebsd_vinactive(struct vnode *vp, struct thread *td)
+{
+
+	ASSERT_VOP_ELOCKED(vp, "vinactive");
+	ASSERT_VI_LOCKED(vp, "vinactive");
+	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
+	    ("vinactive: recursed on VI_DOINGINACT"));
+	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+	vp->v_iflag |= VI_DOINGINACT;
+	vp->v_iflag &= ~VI_OWEINACT;
+	VI_UNLOCK(vp);
+	VOP_INACTIVE(vp, td);
+	VI_LOCK(vp);
+	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
+	    ("vinactive: lost VI_DOINGINACT"));
+	vp->v_iflag &= ~VI_DOINGINACT;
+}
+
 /* Extensible attribute (xva) routines. */
 
 /*
@@ -72,3 +101,144 @@
 	xoap = &xvap->xva_xoptattrs;
 	return (xoap);
 }
+
+static void
+vn_rele_inactive(vnode_t *vp)
+{
+	struct thread *td = curthread;
+
+	VI_LOCK(vp);
+	/*
+	 * We want to hold the vnode until the inactive finishes to
+	 * prevent vgone() races.  We drop the use count here and the
+	 * hold count below when we're done.
+	 */
+	vp->v_usecount--;
+	vp->v_iflag |= VI_OWEINACT;
+	/*
+	 * We must call VOP_INACTIVE with the node locked.  Mark
+	 * as VI_DOINGINACT to avoid recursion.
+	 */
+	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
+		VI_LOCK(vp);
+		if (vp->v_usecount > 0)
+			vp->v_iflag &= ~VI_OWEINACT;
+		if (vp->v_iflag & VI_OWEINACT)
+			freebsd_vinactive(vp, td);
+		VOP_UNLOCK(vp, 0);
+	} else {
+		VI_LOCK(vp);
+		if (vp->v_usecount > 0)
+			vp->v_iflag &= ~VI_OWEINACT;
+	}
+	vdropl(vp);
+}
+
+static STAILQ_HEAD(, vnode) vn_rele_async_list;
+static struct mtx vn_rele_async_lock;
+static struct cv vn_rele_async_cv;
+static int vn_rele_list_length;
+
+/* Overlay used to link vnodes through the otherwise idle v_cstart field. */
+typedef struct {
+	struct vnode *stqe_next;
+} vnode_link_t;
+
+/*
+ * Like vn_rele(), except that if we would call VOP_INACTIVE() the release
+ * is done asynchronously.  This avoids deadlocks caused by re-entering the
+ * file system as a result of releasing the vnode.  Note, file systems
+ * already have to handle the race where the vnode is incremented before
+ * the inactive routine is called and does its locking.
+ *
+ * On Solaris the release is dispatched to the supplied taskq; in this
+ * FreeBSD port the taskq argument is unused and queued vnodes are instead
+ * drained by the "vaclean" kernel process created below.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq /* unused */)
+{
+
+	KASSERT(vp != NULL, ("vn_rele_async: null vp"));
+	VFS_ASSERT_GIANT(vp->v_mount);
+	VI_LOCK(vp);
+
+	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
+	    vp->v_usecount == 1)) {
+		vp->v_usecount--;
+		vdropl(vp);
+		return;
+	}
+	if (vp->v_usecount != 1) {
+#ifdef DIAGNOSTIC
+		vprint("vn_rele_async: negative ref count", vp);
+#endif
+		VI_UNLOCK(vp);
+		panic("vn_rele_async: negative ref cnt");
+	}
+	mtx_lock(&vn_rele_async_lock);
+
+	/* STAILQ_INSERT_TAIL, linking through the overlaid v_cstart field. */
+	(*(vnode_link_t *)&vp->v_cstart).stqe_next = NULL;
+	*vn_rele_async_list.stqh_last = vp;
+	vn_rele_async_list.stqh_last =
+	    &((vnode_link_t *)&vp->v_cstart)->stqe_next;
+
+	vn_rele_list_length++;
+	/* Wake the cleaner in batches; it also polls every hz/10 ticks. */
+	if ((vn_rele_list_length % 100) == 0)
+		cv_signal(&vn_rele_async_cv);
+	mtx_unlock(&vn_rele_async_lock);
+	VI_UNLOCK(vp);
+}
+
+static void
+va_setup(void *arg)
+{
+
+	mtx_init(&vn_rele_async_lock, "valock", NULL, MTX_DEF);
+	STAILQ_INIT(&vn_rele_async_list);
+
+	/* Open-coded equivalent of cv_init(&vn_rele_async_cv, "vacv"). */
+	vn_rele_async_cv.cv_description = "vacv";
+	vn_rele_async_cv.cv_waiters = 0;
+}
+
+/*
+ * Drain the queue of deferred releases: batch the pending vnodes onto a
+ * private list, drop the lock, and call vn_rele_inactive() on each.
+ */
+static void
+vrele_async_cleaner(void)
+{
+	STAILQ_HEAD(, vnode) vn_tmp_list;
+	struct vnode *curvnode;
+
+	STAILQ_INIT(&vn_tmp_list);
+	mtx_lock(&vn_rele_async_lock);
+	for (;;) {
+		STAILQ_CONCAT(&vn_tmp_list, &vn_rele_async_list);
+		vn_rele_list_length = 0;
+		mtx_unlock(&vn_rele_async_lock);
+
+		while (!STAILQ_EMPTY(&vn_tmp_list)) {
+			curvnode = STAILQ_FIRST(&vn_tmp_list);
+
+			/* STAILQ_REMOVE_HEAD via the overlaid v_cstart link. */
+			STAILQ_FIRST(&vn_tmp_list) =
+			    ((vnode_link_t *)&curvnode->v_cstart)->stqe_next;
+			if (STAILQ_FIRST(&vn_tmp_list) == NULL)
+				vn_tmp_list.stqh_last =
+				    &STAILQ_FIRST(&vn_tmp_list);
+
+			vn_rele_inactive(curvnode);
+		}
+		mtx_lock(&vn_rele_async_lock);
+		if (vn_rele_list_length == 0)
+			cv_timedwait(&vn_rele_async_cv, &vn_rele_async_lock,
+			    hz/10);
+	}
+}
+
+static struct proc *vreleproc;
+static struct kproc_desc up_kp = {
+	"vaclean",
+	vrele_async_cleaner,
+	&vreleproc
+};
+SYSINIT(vaclean, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+SYSINIT(vasetup, SI_SUB_VFS, SI_ORDER_FIRST, va_setup, NULL);
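
[Note, not part of the patch: struct vnode has no spare queue linkage, so vn_rele_async() overlays STAILQ linkage on the unused v_cstart field via the vnode_link_t cast and open-codes STAILQ_INSERT_TAIL and STAILQ_REMOVE_HEAD. With a dedicated, hypothetical entry field the same queue operations would read as follows:]

	/* Hypothetical: if struct vnode carried its own linkage field... */
	STAILQ_ENTRY(vnode) v_async_entry;

	/* ...the producer side would be: */
	STAILQ_INSERT_TAIL(&vn_rele_async_list, vp, v_async_entry);

	/* ...and the consumer side: */
	curvnode = STAILQ_FIRST(&vn_tmp_list);
	STAILQ_REMOVE_HEAD(&vn_tmp_list, v_async_entry);

[The cv_signal() every 100 queued vnodes plus the cleaner's hz/10 cv_timedwait() give simple batching: bursts are drained promptly, while a trickle of releases waits at most about 100 ms.]
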
Index: sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h	(revision 191679)
+++ sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h	(working copy)
@@ -377,6 +377,13 @@
 void		xva_init(xvattr_t *);
 xoptattr_t	*xva_getxoptattr(xvattr_t *);	/* Get ptr to xoptattr_t */
 
+struct taskq;
+void	vn_rele_async(struct vnode *vp, struct taskq *taskq);
+
+#define	VN_RELE_ASYNC(vp, taskq)	{ \
+	vn_rele_async(vp, taskq);	\
+}
+
 /*
  * Flags to VOP_SETATTR/VOP_GETATTR.
 */
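
[Note, not part of the patch: VN_RELE_ASYNC() just wraps vn_rele_async() in a block so existing Solaris-style call sites can keep using it as a statement. Typical use from syncing context, matching the zfs_vnops.c hunks above:]

	objset_t *os = zp->z_zfsvfs->z_os;

	/* Never drop the last reference inline while a txg is held open. */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

[One nit: a do { ... } while (0) wrapper would be safer than bare braces, since with the brace form the trailing semicolon at a call site ends the statement early and breaks an if/else around the macro.]
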