I'll spare a lengthy write-up because I think the patch documents it well enough. It certainly appears to fix things here when doing very large block-sized writes, but it also reduces the throughput with those block sizes. (I don't think there should be any difference when using reasonable block sizes.) Would anyone care to take a shot at fixing it in a more elegant manner? Index: sys/buf.h =================================================================== RCS file: /export/ncvs/src/sys/sys/buf.h,v retrieving revision 1.167.2.1 diff -u -r1.167.2.1 buf.h --- sys/buf.h 31 Jan 2005 23:26:55 -0000 1.167.2.1 +++ sys/buf.h 15 Apr 2005 02:00:44 -0000 @@ -469,6 +469,7 @@ extern int maxswzone; /* Max KVA for swap structures */ extern int maxbcache; /* Max KVA for buffer cache */ extern int runningbufspace; +extern int hibufspace; extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ Index: kern/vfs_bio.c =================================================================== RCS file: /export/ncvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.444.2.2 diff -u -r1.444.2.2 vfs_bio.c --- kern/vfs_bio.c 31 Jan 2005 23:26:18 -0000 1.444.2.2 +++ kern/vfs_bio.c 15 Apr 2005 01:59:38 -0000 @@ -113,7 +113,7 @@ static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); -static int hibufspace; +int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; Index: nfsclient/nfs_bio.c =================================================================== RCS file: /export/ncvs/src/sys/nfsclient/nfs_bio.c,v retrieving revision 1.133.2.2 diff -u -r1.133.2.2 nfs_bio.c --- nfsclient/nfs_bio.c 31 Jan 2005 23:26:46 -0000 1.133.2.2 +++ nfsclient/nfs_bio.c 15 Apr 2005 04:41:13 -0000 @@ 
-726,6 +726,7 @@ struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; + off_t commitleft; int bcount; int n, on, error = 0; int haverslock = 0; @@ -755,6 +756,7 @@ */ if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { +flush_and_restart: np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); if (error) @@ -832,12 +834,65 @@ } biosize = vp->v_mount->mnt_stat.f_iosize; + commitleft = 0; + /* + * If there are possible modifications, then there may be some + * B_NEEDCOMMIT buffers. Total those up here and force a flush + * before starting to write if our writes can exceed the local + * maximum per-file write commit size. + * + * If there are no possible pending modifications, we still need + * to limit our write to that size. + */ + if ((ioflag & (IO_SYNC | IO_INVAL)) != (IO_SYNC | IO_INVAL)) { + commitleft = nmp->nm_wcommitsize; + if (np->n_flag & NMODIFIED) { + int wouldcommit = 0; + VI_LOCK(vp); + TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + if (bp->b_flags & B_NEEDCOMMIT) + wouldcommit += bp->b_bcount; + } + VI_UNLOCK(vp); + /* + * Since we're not operating synchronously and + * bypassing the buffer cache, we are in a commit + * and holding all of these buffers whether + * transmitted or not. If not limited, this + * will lead to the buffer cache deadlocking, + * as no one else can flush our uncommitted buffers. + */ + wouldcommit += uio->uio_resid; + /* + * If we would initially exceed the maximum + * outstanding write commit size, flush and restart. + */ + if (wouldcommit > commitleft) { + if (haverslock) { + nfs_rsunlock(np, td); + haverslock = 0; + } + goto flush_and_restart; + } + } else { + /* + * With no outstanding commits, we are limited only + * by commitleft as to how far we can go. 
+ */ + } + } do { nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); + /* Always allow at least one write. */ + if (commitleft > 0) { + commitleft -= n; + if (commitleft == 0) + commitleft = -1; + } again: /* * Handle direct append and file extension cases, calculate @@ -932,12 +987,6 @@ break; } } - if (!bp) { - error = nfs_sigintr(nmp, NULL, td); - if (!error) - error = EINTR; - break; - } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); np->n_flag |= NMODIFIED; @@ -1036,7 +1085,7 @@ } else { bdwrite(bp); } - } while (uio->uio_resid > 0 && n > 0); + } while (uio->uio_resid > 0 && n > 0 && commitleft >= 0); if (haverslock) nfs_rsunlock(np, td); Index: nfsclient/nfs_vfsops.c =================================================================== RCS file: /export/ncvs/src/sys/nfsclient/nfs_vfsops.c,v retrieving revision 1.158.2.3 diff -u -r1.158.2.3 nfs_vfsops.c --- nfsclient/nfs_vfsops.c 31 Jan 2005 23:26:46 -0000 1.158.2.3 +++ nfsclient/nfs_vfsops.c 15 Apr 2005 02:03:05 -0000 @@ -41,6 +41,8 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/bio.h> +#include <sys/buf.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -625,6 +627,12 @@ else nmp->nm_readahead = NFS_MAXRAHEAD; } + if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) { + if (argp->wcommitsize < nmp->nm_wsize) + nmp->nm_wcommitsize = nmp->nm_wsize; + else + nmp->nm_wcommitsize = argp->wcommitsize; + } if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) { if (argp->deadthresh <= NFS_MAXDEADTHRESH) nmp->nm_deadthresh = argp->deadthresh; @@ -785,6 +793,7 @@ nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; } + nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000); nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = 
NFS_DEFRAHEAD; Index: nfsclient/nfsargs.h =================================================================== RCS file: /export/ncvs/src/sys/nfsclient/nfsargs.h,v retrieving revision 1.66.2.1 diff -u -r1.66.2.1 nfsargs.h --- nfsclient/nfsargs.h 31 Jan 2005 23:26:46 -0000 1.66.2.1 +++ nfsclient/nfsargs.h 15 Apr 2005 01:33:08 -0000 @@ -56,7 +56,7 @@ int retrans; /* times to retry send */ int maxgrouplist; /* Max. size of group list */ int readahead; /* # of blocks to readahead */ - int __pad1; /* was "leaseterm" */ + int wcommitsize; /* Max. write commit size in bytes */ int deadthresh; /* Retrans threshold */ char *hostname; /* server's name */ int acregmin; /* cache attrs for reg files min time */ @@ -80,7 +80,7 @@ #define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */ /* 0x400 free, was NFSMNT_KERB */ #define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */ -/* 0x1000 free, was NFSMNT_LEASETERM */ +#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */ #define NFSMNT_READAHEAD 0x00002000 /* set read ahead */ #define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */ #define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */ Index: nfsclient/nfsmount.h =================================================================== RCS file: /export/ncvs/src/sys/nfsclient/nfsmount.h,v retrieving revision 1.27.2.1 diff -u -r1.27.2.1 nfsmount.h --- nfsclient/nfsmount.h 31 Jan 2005 23:26:46 -0000 1.27.2.1 +++ nfsclient/nfsmount.h 15 Apr 2005 01:21:57 -0000 @@ -66,6 +66,7 @@ int nm_wsize; /* Max size of write rpc */ int nm_readdirsize; /* Size of a readdir rpc */ int nm_readahead; /* Num. 
of blocks to readahead */ + int nm_wcommitsize; /* Max size of commit for write */ int nm_acdirmin; /* Directory attr cache min lifetime */ int nm_acdirmax; /* Directory attr cache max lifetime */ int nm_acregmin; /* Reg file attr cache min lifetime */ -- Brian Fundakowski Feldman \'[ FreeBSD ]''''''''''\ <> green_at_FreeBSD.org \ The Power to Serve! \ Opinions expressed are my own. \,,,,,,,,,,,,,,,,,,,,,,\Received on Fri Apr 15 2005 - 03:07:37 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:38:32 UTC