--- nfs_syscalls.c.orig 2007-10-12 03:56:27.000000000 +0000
+++ nfs_syscalls.c 2007-11-16 21:09:46.000000000 +0000
@@ -85,7 +85,6 @@
 int nfsd_waiting = 0;
 int nfsrv_numnfsd = 0;
-static int notstarted = 1;
 static int nfs_privport = 0;
 SYSCTL_INT(_vfs_nfsrv, NFS_NFSPRIVPORT, nfs_privport, CTLFLAG_RW,
@@ -452,9 +451,8 @@
         else
                 procrastinate = nfsrvw_procrastinate;
         NFSD_UNLOCK();
-        if (writes_todo || (!(nd->nd_flag & ND_NFSV3) &&
-            nd->nd_procnum == NFSPROC_WRITE &&
-            procrastinate > 0 && !notstarted))
+        if (writes_todo || (nd->nd_procnum == NFSPROC_WRITE &&
+            procrastinate > 0))
                 error = nfsrv_writegather(&nd, slp, nfsd->nfsd_td, &mreq);
         else
--- nfs_serv.c.orig 2007-10-18 16:38:07.000000000 +0000
+++ nfs_serv.c 2007-11-16 21:09:46.000000000 +0000
@@ -106,16 +106,20 @@
 #define MAX_COMMIT_COUNT (1024 * 1024)
-#define NUM_HEURISTIC 1017
+#define NUM_HEURISTIC 1031 /* Must be prime! */
+#define MAX_REORDERED_RPC 4
+#define HASH_MAXSTEP 0x3ff
 #define NHUSE_INIT 64
 #define NHUSE_INC 16
 #define NHUSE_MAX 2048
+#define NH_TAG(vp) ((uint32_t)((uintptr_t)vp / sizeof(struct vnode)))
+CTASSERT(NUM_HEURISTIC > (HASH_MAXSTEP + 1));
 static struct nfsheur {
-        struct vnode *nh_vp;    /* vp to match (unreferenced pointer) */
-        off_t nh_nextr;         /* next offset for sequential detection */
-        int nh_use;             /* use count for selection */
-        int nh_seqcount;        /* heuristic */
+        off_t nh_nextoff;       /* next offset for sequential detection */
+        uint32_t nh_tag;        /* vp tag to match */
+        uint16_t nh_use;        /* use count for selection */
+        uint16_t nh_seqcount;   /* in units of logical blocks */
 } nfsheur[NUM_HEURISTIC];
 /* Global vars */
@@ -130,10 +134,16 @@
 static int nfs_async;
 static int nfs_commit_blks;
 static int nfs_commit_miss;
+static int nfsrv_cluster_writes = 1;
+static int nfsrv_cluster_reads = 1;
+static int nfsrv_reordered_io;
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, async, CTLFLAG_RW, &nfs_async, 0, "");
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks, 0, "");
 SYSCTL_INT(_vfs_nfsrv, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss, 0, "");
+SYSCTL_INT(_vfs_nfsrv, OID_AUTO, cluster_writes, CTLFLAG_RW, &nfsrv_cluster_writes, 0, "");
+SYSCTL_INT(_vfs_nfsrv, OID_AUTO, cluster_reads, CTLFLAG_RW, &nfsrv_cluster_reads, 0, "");
+SYSCTL_INT(_vfs_nfsrv, OID_AUTO, reordered_io, CTLFLAG_RW, &nfsrv_reordered_io, 0, "");
 struct nfsrvstats nfsrvstats;
 SYSCTL_STRUCT(_vfs_nfsrv, NFS_NFSRVSTATS, nfsrvstats, CTLFLAG_RW, &nfsrvstats, nfsrvstats, "S,nfsrvstats");
@@ -144,6 +154,75 @@
                      struct nfsrv_descript *);
 /*
+ * Detect sequential access so that we are able to hint the underlying
+ * file system to use clustered I/O when appropriate.
+ */
+static int
+nfsrv_sequential_access(const struct uio *uio, const struct vnode *vp)
+{
+        struct nfsheur *nh;
+        unsigned hi, step;
+        int try = 8;
+        int nblocks, lblocksize;
+
+        /*
+         * Locate best nfsheur[] candidate using double hashing.
+         */
+
+        hi = NH_TAG(vp) % NUM_HEURISTIC;
+        step = NH_TAG(vp) & HASH_MAXSTEP;
+        step++; /* Step must not be zero. */
+        nh = &nfsheur[hi];
+
+        while (try--) {
+                if (nfsheur[hi].nh_tag == NH_TAG(vp)) {
+                        nh = &nfsheur[hi];
+                        break;
+                }
+                if (nfsheur[hi].nh_use > 0)
+                        --nfsheur[hi].nh_use;
+                hi = hi + step;
+                if (hi >= NUM_HEURISTIC)
+                        hi -= NUM_HEURISTIC;
+                if (nfsheur[hi].nh_use < nh->nh_use)
+                        nh = &nfsheur[hi];
+        }
+
+        if (nh->nh_tag != NH_TAG(vp)) { /* New entry. */
+                nh->nh_tag = NH_TAG(vp);
+                nh->nh_nextoff = uio->uio_offset;
+                nh->nh_use = NHUSE_INIT;
+                nh->nh_seqcount = 1; /* Initially assume sequential access. */
+        } else {
+                nh->nh_use += NHUSE_INC;
+                if (nh->nh_use > NHUSE_MAX)
+                        nh->nh_use = NHUSE_MAX;
+        }
+
+        /*
+         * Calculate heuristic
+         */
+
+        lblocksize = vp->v_mount->mnt_stat.f_iosize;
+        nblocks = howmany(uio->uio_resid, lblocksize);
+        if (uio->uio_offset == nh->nh_nextoff) {
+                nh->nh_seqcount += nblocks;
+                if (nh->nh_seqcount > IO_SEQMAX)
+                        nh->nh_seqcount = IO_SEQMAX;
+        } else if (uio->uio_offset == 0) {
+                /* Seek to beginning of file, ignored. */
+        } else if (qabs(uio->uio_offset - nh->nh_nextoff) <=
+            MAX_REORDERED_RPC*imax(lblocksize, uio->uio_resid)) {
+                nfsrv_reordered_io++; /* Probably reordered RPC, do nothing. */
+        } else
+                nh->nh_seqcount /= 4; /* Not sequential access. */
+
+        nh->nh_nextoff = uio->uio_offset + uio->uio_resid;
+
+        return (nh->nh_seqcount << IO_SEQSHIFT);
+}
+
+/*
  * Clear nameidata fields that are tested in nsfmout cleanup code prior
  * to using first nfsm macro (that might jump to the cleanup code).
  */
@@ -783,7 +862,6 @@
         fhandle_t *fhp;
         struct uio io, *uiop = &io;
         struct vattr va, *vap = &va;
-        struct nfsheur *nh;
         off_t off;
         int ioflag = 0;
         int vfslocked;
@@ -855,61 +933,6 @@
         else
                 cnt = reqlen;
-        /*
-         * Calculate seqcount for heuristic
-         */
-
-        {
-                int hi;
-                int try = 32;
-
-                /*
-                 * Locate best candidate
-                 */
-
-                hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
-                nh = &nfsheur[hi];
-
-                while (try--) {
-                        if (nfsheur[hi].nh_vp == vp) {
-                                nh = &nfsheur[hi];
-                                break;
-                        }
-                        if (nfsheur[hi].nh_use > 0)
-                                --nfsheur[hi].nh_use;
-                        hi = (hi + 1) % NUM_HEURISTIC;
-                        if (nfsheur[hi].nh_use < nh->nh_use)
-                                nh = &nfsheur[hi];
-                }
-
-                if (nh->nh_vp != vp) {
-                        nh->nh_vp = vp;
-                        nh->nh_nextr = off;
-                        nh->nh_use = NHUSE_INIT;
-                        if (off == 0)
-                                nh->nh_seqcount = 4;
-                        else
-                                nh->nh_seqcount = 1;
-                }
-
-                /*
-                 * Calculate heuristic
-                 */
-
-                if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
-                        if (++nh->nh_seqcount > IO_SEQMAX)
-                                nh->nh_seqcount = IO_SEQMAX;
-                } else if (nh->nh_seqcount > 1) {
-                        nh->nh_seqcount = 1;
-                } else {
-                        nh->nh_seqcount = 0;
-                }
-                nh->nh_use += NHUSE_INC;
-                if (nh->nh_use > NHUSE_MAX)
-                        nh->nh_use = NHUSE_MAX;
-                ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
-        }
-
         nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt));
         if (v3) {
                 tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED);
@@ -967,9 +990,10 @@
         uiop->uio_resid = len;
         uiop->uio_rw = UIO_READ;
         uiop->uio_segflg = UIO_SYSSPACE;
+        if (nfsrv_cluster_reads)
+                ioflag |= nfsrv_sequential_access(uiop, vp);
         error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
         off = uiop->uio_offset;
-        nh->nh_nextr = off;
         FREE((caddr_t)iv2, M_TEMP);
         if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) {
                 if (!error)
@@ -1175,6 +1199,8 @@
         uiop->uio_segflg = UIO_SYSSPACE;
         uiop->uio_td = NULL;
         uiop->uio_offset = off;
+        if (nfsrv_cluster_writes)
+                ioflags |= nfsrv_sequential_access(uiop, vp);
         error = VOP_WRITE(vp, uiop, ioflags, cred);
         /* XXXRW: unlocked write. */
         nfsrvstats.srvvop_writes++;
@@ -1223,6 +1249,7 @@
         vput(vp);
         vn_finished_write(mntp);
         VFS_UNLOCK_GIANT(vfslocked);
+        bwillwrite(); /* After VOP_WRITE to avoid reordering. */
         return(error);
 }
@@ -1489,6 +1516,8 @@
                         mvfslocked = VFS_LOCK_GIANT(mntp);
                 }
                 if (!error) {
+                        if (nfsrv_cluster_writes)
+                                ioflags |= nfsrv_sequential_access(uiop, vp);
                         error = VOP_WRITE(vp, uiop, ioflags, cred);
                         /* XXXRW: unlocked write. */
                         nfsrvstats.srvvop_writes++;
@@ -1579,6 +1608,7 @@
                 break;
         }
         splx(s);
+        bwillwrite(); /* After VOP_WRITE to avoid reordering. */
         return (0);
 }
@@ -3824,7 +3854,11 @@
         }
         for_ret = VOP_GETATTR(vp, &bfor, cred, td);
-        if (cnt > MAX_COMMIT_COUNT) {
+        /*
+         * If count is 0, a flush from offset to the end of file
+         * should be performed according to RFC 1813.
+         */
+        if (cnt == 0 || cnt > MAX_COMMIT_COUNT) {
                 /*
                  * Give up and do the whole thing
                  */
@@ -3868,7 +3902,7 @@
         s = splbio();
         VI_LOCK(vp);
-        while (cnt > 0) {
+        while (!error && cnt > 0) {
                 struct buf *bp;
                 /*
@@ -3891,7 +3925,7 @@
                     B_DELWRI) {
                         bremfree(bp);
                         bp->b_flags &= ~B_ASYNC;
-                        bwrite(bp);
+                        error = bwrite(bp);
                         ++nfs_commit_miss;
                 } else
                         BUF_UNLOCK(bp);