Re: sysctl spinning (was: Re: ps Causes Hard Hang)

From: Don Lewis <truckman@FreeBSD.org>
Date: Thu, 4 Mar 2004 22:56:27 -0800 (PST)
On  4 Mar, Don Lewis wrote:
> On  5 Mar, Bruce Evans wrote:

>> POSIX is more authoritative.  It says that ENOMEM may be returned if
>> an implementation-defined limit would be exceeded, and that EAGAIN
>> shall be returned if some or all of the memory could not be locked
>> when the call was made.  So it literally requires returning EAGAIN
>> even when the limit would be exceeded, but it doesn't mean to require
>> that.
> 
> That's the same as SUSv3.
> 
> I was pretty frazzled when I sent my previous message and was thinking
> the test was something like:
> 	if (atop(size) > vm_page_max_wired)


>>> > I think EAGAIN was only meant for retrying after transient changes
>>> > to the data.
>>>
>>> I think there may be cases where our memory wiring implementation
>>> would legitimately want to return EAGAIN without causing a tight
>>> loop, but there is currently no way to distinguish this from the
>>> case that you mention.
>> 
>> I think it should return EAGAIN except when certain limits including the
>> rlimit would be exceeded.  The rlimit is often large or infinity, so
>> it must be easy to exceed the system-wide limit if lots of processes use
>> locked memory.  Perhaps every process should be guaranteed a few pages of
>> locked memory so that most sysctls always work.  This is like guaranteeing
>> that a minimum number of files (20 or so) can be opened, but I don't
>> believe we even guarantee that, especially when the files are pipes
>> wanting 64K of wired memory each.
> 
> One problem is that we should really distinguish "permanently" wired
> memory (from mlock(), etc.) from memory that is transiently wired by
> sysctl() and device drivers.  I started regretting my change that
> merged the mlock() and vslock() implementations after I realized this.
> 
> Permanent requests should be rejected while there is still memory
> available for transient requests.
> 
> All transient requests should be checked against the limits.
> 
> There should be a per-uid limit on total wired memory so that a single
> user can't DoS the system by consuming all the allowable wired memory
> even if the user forks a large number of processes to take full
> advantage of any per-process limits.
> 
> There should be a way of queuing transient requests, either blocking
> transparently within the API, or by allowing the caller to sleep and be
> woken when the request would succeed.
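
To make that last point concrete, the sleep/wakeup side could look
something like the sketch below.  This is purely hypothetical -- the
names vs_transient_mtx, vs_transient_count, and vs_transient_max are
invented for illustration (assume the usual sys/mutex.h kernel
environment), and nothing like this is in the patch:

	static struct mtx vs_transient_mtx;
	static int vs_transient_count;	/* pages wired transiently */
	static int vs_transient_max;	/* cap on transient wirings */

	/*
	 * Sleep until npages of transient wired memory are available,
	 * then reserve them.  Interruptible, so a signal breaks the
	 * wait instead of looping on EAGAIN.
	 */
	static int
	vs_wait_for_pages(int npages)
	{
		int error = 0;

		mtx_lock(&vs_transient_mtx);
		while (vs_transient_count + npages > vs_transient_max) {
			error = msleep(&vs_transient_count,
			    &vs_transient_mtx, PVM | PCATCH, "vswire", 0);
			if (error)
				break;
		}
		if (error == 0)
			vs_transient_count += npages;
		mtx_unlock(&vs_transient_mtx);
		return (error);
	}

The unlock path would subtract the pages and wakeup(&vs_transient_count)
so that queued requests get another chance.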

I'm currently running the patch below.

This patch backs out the vslock()/mlock() merger.  I retained the best
parts of the old mlock() and vslock() implementations with some
additional cleanup.  Both implementations are now as similar as
possible.

This patch retains the resource limit checking that I enabled in
mlock(), and, as in my previous patch, it reorders the limit checks so
that the test that could return EAGAIN comes last.

I have disabled the EAGAIN return in vslock() since it breaks the
kernel sysctl code.  At least the interface used by sysctl will obey
the RLIMIT_MEMLOCK limit, and an individual request can't exceed
vm_page_max_wired.
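
(The hard loop in question is the retry in userland_sysctl(), which
resubmits the request for as long as the handler returns EAGAIN.
Paraphrasing from memory rather than quoting the source, it is
essentially:

	for (;;) {
		req = req2;
		error = sysctl_root(0, name, namelen, &req);
		if (error != EAGAIN)
			break;
	}

so an unconditional EAGAIN from vslock() turns into a tight spin.)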

I'd like to commit this patch soon to fix the kernel breakage that a
number of people have experienced.
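
For anyone who wants to poke at it, here is an untested sketch of the
sort of check I have in mind; nothing below is from the patch itself.
kern.proc wires the user buffer, so with RLIMIT_MEMLOCK clamped to
zero the fill call should now fail cleanly with ENOMEM instead of
wedging the machine:

	#include <sys/types.h>
	#include <sys/resource.h>
	#include <sys/sysctl.h>
	#include <err.h>
	#include <stdlib.h>

	int
	main(void)
	{
		struct rlimit rl = { 0, 0 };
		int mib[3] = { CTL_KERN, KERN_PROC, KERN_PROC_ALL };
		size_t len;
		void *buf;

		/* Forbid locked memory for this process. */
		if (setrlimit(RLIMIT_MEMLOCK, &rl) == -1)
			err(1, "setrlimit");

		/*
		 * The sizing call passes a NULL buffer, so nothing
		 * gets wired and it should still succeed.
		 */
		if (sysctl(mib, 3, NULL, &len, NULL, 0) == -1)
			err(1, "sysctl (size)");

		if ((buf = malloc(len)) == NULL)
			err(1, "malloc");

		/* The fill call wires the buffer; expect ENOMEM. */
		if (sysctl(mib, 3, buf, &len, NULL, 0) == -1)
			warn("sysctl (fill)");
		free(buf);
		return (0);
	}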


Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.151
diff -u -r1.151 kern_sysctl.c
--- sys/kern/kern_sysctl.c	27 Feb 2004 17:13:23 -0000	1.151
+++ sys/kern/kern_sysctl.c	5 Mar 2004 05:58:40 -0000
@@ -1000,7 +1000,7 @@
 	error = sysctl_root(0, name, namelen, &req);
 
 	if (req.lock == REQ_WIRED)
-		kern_munlock(req.td, (vm_offset_t)req.oldptr,
+		vsunlock(req.td, (vm_offset_t)req.oldptr,
 		    (vm_size_t)req.wiredlen);
 
 	SYSCTL_UNLOCK();
@@ -1103,7 +1103,7 @@
 	ret = 0;
 	if (req->lock == REQ_LOCKED && req->oldptr &&
 	    req->oldfunc == sysctl_old_user) {
-		ret = kern_mlock(req->td, (vm_offset_t)req->oldptr,
+		ret = vslock(req->td, (vm_offset_t)req->oldptr,
 		    (vm_size_t)wiredlen);
 		if (ret == 0) {
 			req->lock = REQ_WIRED;
@@ -1320,7 +1320,7 @@
 
 	req = req2;
 	if (req.lock == REQ_WIRED)
-		kern_munlock(req.td, (vm_offset_t)req.oldptr,
+		vsunlock(req.td, (vm_offset_t)req.oldptr,
 		    (vm_size_t)req.wiredlen);
 
 	SYSCTL_UNLOCK();
Index: sys/vm/vm_extern.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_extern.h,v
retrieving revision 1.69
diff -u -r1.69 vm_extern.h
--- sys/vm/vm_extern.h	26 Feb 2004 00:27:04 -0000	1.69
+++ sys/vm/vm_extern.h	5 Mar 2004 05:42:44 -0000
@@ -59,8 +59,6 @@
 int swapon(struct thread *, void *, int *);
 #endif			/* TYPEDEF_FOR_UAP */
 
-int kern_mlock(struct thread *, vm_offset_t, vm_size_t);
-int kern_munlock(struct thread *, vm_offset_t, vm_size_t);
 int kernacc(void *, int, int);
 vm_offset_t kmem_alloc(vm_map_t, vm_size_t);
 vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t);
@@ -88,6 +86,8 @@
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
+int vslock(struct thread *, vm_offset_t, vm_size_t);
+int vsunlock(struct thread *, vm_offset_t, vm_size_t);
 void vm_object_print(/* db_expr_t */ long, boolean_t, /* db_expr_t */ long,
 			  char *);
 int vm_fault_quick(caddr_t v, int prot);
Index: sys/vm/vm_glue.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_glue.c,v
retrieving revision 1.190
diff -u -r1.190 vm_glue.c
--- sys/vm/vm_glue.c	26 Feb 2004 00:27:04 -0000	1.190
+++ sys/vm/vm_glue.c	5 Mar 2004 06:38:36 -0000
@@ -184,6 +184,84 @@
 }
 
 /*
+ * MPSAFE
+ */
+int
+vslock(td, addr, size)
+	struct thread *td;
+	vm_offset_t addr;
+	vm_size_t size;
+{
+	vm_offset_t start, end;
+	struct proc *proc = td->td_proc;
+	int error, npages;
+
+	start = trunc_page(addr);
+	end = round_page(addr + size);
+
+	/* disable wrap around */
+	if (end <= start)
+		return (EINVAL);
+
+	npages = atop(end - start);
+
+	if (npages > vm_page_max_wired)
+		return (ENOMEM);
+
+	PROC_LOCK(proc);
+	if (npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map)) >
+	    atop(lim_cur(proc, RLIMIT_MEMLOCK))) {
+		PROC_UNLOCK(proc);
+		return (ENOMEM);
+	}
+	PROC_UNLOCK(proc);
+
+#if 0
+	/*
+	 * XXX - not yet
+	 *
+	 * The limit for transient usage of wired pages should be
+	 * larger than for "permanent" wired pages (mlock()).
+	 *
+	 * Also, the sysctl code, which is the only present user
+	 * of vslock(), does a hard loop on EAGAIN.
+	 */
+	if (npages + cnt.v_wire_count > vm_page_max_wired)
+		return (EAGAIN);
+#endif
+
+	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
+	     VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+	
+	/* EINVAL is probably a better error to return than ENOMEM */
+	return (error == KERN_SUCCESS ? 0 : EINVAL);
+}
+
+/*
+ * MPSAFE
+ */
+int
+vsunlock(td, addr, size)
+	struct thread *td;
+	vm_offset_t addr;
+	vm_size_t size;
+{
+	vm_offset_t start, end;
+	int error;
+
+	start = trunc_page(addr);
+	end = round_page(addr + size);
+
+	/* disable wrap around */
+	if (end <= start)
+		return (EINVAL);
+
+	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
+	    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+	return (error == KERN_SUCCESS ? 0 : EINVAL);
+}
+
+/*
  * Create the U area for a new process.
  * This routine directly affects the fork perf for a process.
  */
Index: sys/vm/vm_mmap.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_mmap.c,v
retrieving revision 1.181
diff -u -r1.181 vm_mmap.c
--- sys/vm/vm_mmap.c	1 Mar 2004 02:44:33 -0000	1.181
+++ sys/vm/vm_mmap.c	5 Mar 2004 06:03:19 -0000
@@ -893,49 +893,42 @@
 	struct thread *td;
 	struct mlock_args *uap;
 {
-	int error;
+	struct proc *proc = td->td_proc;
+	vm_offset_t addr, start, end;
+	vm_size_t size;
+	int error, npages;
 
 	error = suser(td);
 	if (error)
 		return (error);
-	return (kern_mlock(td, (vm_offset_t)uap->addr, (vm_size_t)uap->len));
-}
 
-/*
- * MPSAFE
- */
-int
-kern_mlock(td, addr, size)
-	struct thread *td;
-	vm_offset_t addr;
-	vm_size_t size;
-{
-	vm_size_t pageoff;
-	struct proc *proc = td->td_proc;
-	int error;
-
-	pageoff = (addr & PAGE_MASK);
-	addr -= pageoff;
-	size += pageoff;
-	size = (vm_size_t) round_page(size);
+	addr = (vm_offset_t)uap->addr;
+	size = uap->len;
+	start = trunc_page(addr);
+	end = round_page(addr + size);
 
 	/* disable wrap around */
-	if (addr + size < addr)
+	if (end <= start)
 		return (EINVAL);
 
-	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
-		return (EAGAIN);
+	npages = atop(end - start);
+
+	if (npages > vm_page_max_wired)
+		return (ENOMEM);
 
 	PROC_LOCK(proc);
-	if (size + ptoa(pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
-	    lim_cur(proc, RLIMIT_MEMLOCK)) {
+	if (npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map)) >
+	    atop(lim_cur(proc, RLIMIT_MEMLOCK))) {
 		PROC_UNLOCK(proc);
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(proc);
 
-	error = vm_map_wire(&proc->p_vmspace->vm_map, addr,
-		     addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+	if (npages + cnt.v_wire_count > vm_page_max_wired)
+		return (EAGAIN);
+
+	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
+	     VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
@@ -1050,37 +1043,25 @@
 	struct thread *td;
 	struct munlock_args *uap;
 {
+	vm_offset_t addr, start, end;
+	vm_size_t size;
 	int error;
 
 	error = suser(td);
 	if (error)
 		return (error);
-	return (kern_munlock(td, (vm_offset_t)uap->addr, (vm_size_t)uap->len));
-}
 
-/*
- * MPSAFE
- */
-int
-kern_munlock(td, addr, size)
-	struct thread *td;
-	vm_offset_t addr;
-	vm_size_t size;
-{
-	vm_size_t pageoff;
-	int error;
-
-	pageoff = (addr & PAGE_MASK);
-	addr -= pageoff;
-	size += pageoff;
-	size = (vm_size_t) round_page(size);
+	addr = (vm_offset_t)uap->addr;
+	size = uap->len;
+	start = trunc_page(addr);
+	end = round_page(addr + size);
 
 	/* disable wrap around */
-	if (addr + size < addr)
+	if (end <= start)
 		return (EINVAL);
 
-	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
-		     addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
+	     VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 