Re: Panic in scheduler code with SCHED_ULE during boot to multi-user.

From: Jeff Roberson <jroberson_at_chesapeake.net>
Date: Fri, 4 Jul 2003 04:55:42 -0400 (EDT)
Excellent debugging.  Can you tell me whats in kseq_cpu[0] and
kseq_cpu[1]?  I think I may know what the problem is.  Do you have some
negative niced processes or some positive nice processes?


On Thu, 3 Jul 2003, Richard Todd wrote:

> Hi.  Last night I upgraded to the most recent -current source and
> rebuilt everything, and decided on building the kernel to try the new
> SCHED_ULE scheduler (I had been using SCHED_4BSD before).  Alas, the
> experiment did not go well; every time I booted the machine, I got a
> panic just as the system was about to put up the login prompt.
> Switching the kernel config back to SCHED_4BSD and building a kernel
> with the same (last night's) sources gave me a working kernel.  This
> is on a dual-processor PII/400 box.  Below I list what I've got from a
> kernel coredump of the SCHED_ULE kernel; I've added my comments on the gdb
> listing preceded by "#" signs.
>
> ichotolot# gdb -k ./kernel.debug ./vmcore.45
> GNU gdb 5.2.1 (FreeBSD)
> Copyright 2002 Free Software Foundation, Inc.
> GDB is free software, covered by the GNU General Public License, and you are
> welcome to change it and/or distribute copies of it under certain conditions.
> Type "show copying" to see the conditions.
> There is absolutely no warranty for GDB.  Type "show warranty" for details.
> This GDB was configured as "i386-undermydesk-freebsd"...
> panic: page fault
> panic messages:
> ---
> Fatal trap 12: page fault while in kernel mode
> cpuid = 1; lapic.id = 01000000
> fault virtual address	= 0x38
> fault code		= supervisor read, page not present
> instruction pointer	= 0x8:0xc036835d
> stack pointer	        = 0x10:0xe1cfbbbc
> frame pointer	        = 0x10:0xe1cfbbcc
> code segment		= base 0x0, limit 0xfffff, type 0x1b
> 			= DPL 0, pres 1, def32 1, gran 1
> processor eflags	= interrupt enabled, resume, IOPL = 0
> current process		= 649 (squid)
> trap number		= 12
> panic: page fault
> cpuid = 1; lapic.id = 01000000
> boot() called on cpu#1
>
> syncing disks, buffers remaining... panic: absolutely cannot call smp_ipi_shootdown with interrupts already disabled
> cpuid = 1; lapic.id = 01000000
> boot() called on cpu#1
> Uptime: 1m12s
> Dumping 638 MB
>  16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 256 272 288 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576 592 608 624
> ---
> Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug...done.
> Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug
> Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug...done.
> Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug
> Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug...done.
> Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug
> Reading symbols from /boot/kernel/green_saver.ko...done.
> Loaded symbols for /boot/kernel/green_saver.ko
> #0  doadump () at ../../../kern/kern_shutdown.c:240
> 240		dumping++;
> (kgdb) bt
> #0  doadump () at ../../../kern/kern_shutdown.c:240
> #1  0xc03547c0 in boot (howto=260) at ../../../kern/kern_shutdown.c:372
> #2  0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
> #3  0xc050f9db in smp_tlb_shootdown (vector=0, addr1=0, addr2=0)
>     at ../../../i386/i386/mp_machdep.c:2387
> #4  0xc050fc79 in smp_invlpg_range (addr1=0, addr2=0)
>     at ../../../i386/i386/mp_machdep.c:2519
> #5  0xc0511df8 in pmap_invalidate_range (pmap=0xc06dc620, sva=3568271360,
>     eva=1) at ../../../i386/i386/pmap.c:719
> #6  0xc0512118 in pmap_qenter (sva=3568271360, m=0xe1cfb8c0, count=-1)
>     at ../../../i386/i386/pmap.c:943
> #7  0xc03a0448 in vm_hold_load_pages (bp=0xd199d440, from=3568271360,
>     to=3568279552) at ../../../kern/vfs_bio.c:3574
> #8  0xc039ea5c in allocbuf (bp=0xd199d440, size=6144)
>     at ../../../kern/vfs_bio.c:2752
> #9  0xc039e6fe in geteblk (size=6144) at ../../../kern/vfs_bio.c:2634
> #10 0xc039b210 in bwrite (bp=0xd188b8e0) at ../../../kern/vfs_bio.c:818
> #11 0xc039bc6c in bawrite (bp=0x0) at ../../../kern/vfs_bio.c:1153
> #12 0xc03a4860 in vop_stdfsync (ap=0xe1cfba14)
>     at ../../../kern/vfs_default.c:742
> #13 0xc031ba10 in spec_fsync (ap=0xe1cfba14)
>     at ../../../fs/specfs/spec_vnops.c:417
> #14 0xc031ae38 in spec_vnoperate (ap=0x0)
>     at ../../../fs/specfs/spec_vnops.c:122
> #15 0xc04ad2d7 in ffs_sync (mp=0xc5336400, waitfor=2, cred=0xc1c27e80,
>     td=0xc0669ec0) at vnode_if.h:624
> #16 0xc03b0b1b in sync (td=0xc0669ec0, uap=0x0)
>     at ../../../kern/vfs_syscalls.c:142
> #17 0xc03542e2 in boot (howto=256) at ../../../kern/kern_shutdown.c:281
> #18 0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
> #19 0xc0517292 in trap_fatal (frame=0xe1cfbb7c, eva=0)
>     at ../../../i386/i386/trap.c:836
> #20 0xc0516863 in trap (frame=
>       {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
>     at ../../../i386/i386/trap.c:256
> #21 0xc04ff988 in calltrap () at {standard input}:97
> #22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
> #23 0xc035a636 in choosethread () at ../../../kern/kern_switch.c:140
> #24 0xc035c340 in mi_switch () at ../../../kern/kern_synch.c:524
> #25 0xc034b686 in _mtx_lock_sleep (m=0xc066e680, opts=0,
>     file=0xc058251d "../../../kern/kern_descrip.c", line=826)
>     at ../../../kern/kern_mutex.c:641
> #26 0xc034b0f8 in _mtx_lock_flags (m=0xc066e680, opts=0,
>     file=0xc058251d "../../../kern/kern_descrip.c", line=826)
>     at ../../../kern/kern_mutex.c:331
> #27 0xc0335e11 in close (td=0xc5757130, uap=0x0)
>     at ../../../kern/kern_descrip.c:826
> #28 0xc05175ee in syscall (frame=
>       {tf_fs = 47, tf_es = 47, tf_ds = 47, tf_edi = -1077936744, tf_esi = 0, tf_ebp = -1077936840, tf_isp = -506479244, tf_ebx = 689, tf_edx = -1, tf_ecx = 9, tf_eax = 6, tf_trapno = 12, tf_err = 2, tf_eip = 672626595, tf_cs = 31, tf_eflags = 663, tf_esp = -1077936900, tf_ss = 47}) at ../../../i386/i386/trap.c:1023
> #29 0xc04ff9dd in Xint0x80_syscall () at {standard input}:139
> ---Can't read userspace from dump, or kernel process---
> # The initial panic seems to be around frame 22, so let's go there.
> (kgdb) fr 22
> #22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
> 1164			kseq_move(kseq, PCPU_GET(cpuid));
> (kgdb) p kseq
> $1 = (struct kseq *) 0x0
> (kgdb) l 1160
> 1155			 */
> 1156			if ((kseq = kseq_load_highest()) == NULL)
> 1157				return (NULL);
> 1158
> 1159			/*
> 1160			 * Remove this kse from this kseq and runq and then requeue
> 1161			 * on the current processor.  Then we will dequeue it
> 1162			 * normally above.
> 1163			 */
> 1164			kseq_move(kseq, PCPU_GET(cpuid));
> # Hmm, that's odd.  You'd think that if kseq returned by kseq_load_highest()
> # was null, line 1157 would have returned and we'd never hit line 1164.
> # Weird.  Could be gdb is just being confused here, though.  Hard to say.
> # Anyway, on further inspection, it seems the fatal trap wasn't in
> # sched_choose() at all, but elsewhere; look at the eip in the trap frame:
> (kgdb) fr 20
> #20 0xc0516863 in trap (frame=
>       {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
>     at ../../../i386/i386/trap.c:256
> 256				trap_fatal(&frame, eva);
> (kgdb) p/x frame.tf_eip
> $2 = 0xc036835d
> (kgdb) disass 0xc036835d
> Dump of assembler code for function kseq_move:
> 0xc0368340 <kseq_move>:	push   %ebp
> 0xc0368341 <kseq_move+1>:	mov    %esp,%ebp
> 0xc0368343 <kseq_move+3>:	sub    $0x10,%esp
> 0xc0368346 <kseq_move+6>:	mov    %ebx,0xfffffff8(%ebp)
> 0xc0368349 <kseq_move+9>:	mov    %esi,0xfffffffc(%ebp)
> 0xc036834c <kseq_move+12>:	mov    0x8(%ebp),%esi
> 0xc036834f <kseq_move+15>:	mov    %esi,(%esp,1)
> 0xc0368352 <kseq_move+18>:	call   0xc03683a0 <kseq_choose>
> 0xc0368357 <kseq_move+23>:	mov    %eax,%ebx
> 0xc0368359 <kseq_move+25>:	mov    %eax,0x4(%esp,1)
> 0xc036835d <kseq_move+29>:	mov    0x38(%eax),%eax
> 0xc0368360 <kseq_move+32>:	mov    0x4(%eax),%eax
> 0xc0368363 <kseq_move+35>:	mov    %eax,(%esp,1)
> 0xc0368366 <kseq_move+38>:	call   0xc035ac50 <runq_remove>
> 0xc036836b <kseq_move+43>:	movl   $0x4,0x30(%ebx)
> 0xc0368372 <kseq_move+50>:	mov    %ebx,0x4(%esp,1)
> 0xc0368376 <kseq_move+54>:	mov    %esi,(%esp,1)
> 0xc0368379 <kseq_move+57>:	call   0xc0367f30 <kseq_rem>
> 0xc036837e <kseq_move+62>:	mov    0x38(%ebx),%edx
> 0xc0368381 <kseq_move+65>:	mov    0xc(%ebp),%eax
> 0xc0368384 <kseq_move+68>:	mov    %al,0x14(%edx)
> 0xc0368387 <kseq_move+71>:	mov    %ebx,(%esp,1)
> 0xc036838a <kseq_move+74>:	call   0xc0369490 <sched_add>
> 0xc036838f <kseq_move+79>:	mov    0xfffffff8(%ebp),%ebx
> 0xc0368392 <kseq_move+82>:	mov    0xfffffffc(%ebp),%esi
> 0xc0368395 <kseq_move+85>:	mov    %ebp,%esp
> 0xc0368397 <kseq_move+87>:	pop    %ebp
> 0xc0368398 <kseq_move+88>:	ret
> End of assembler dump.
> # Aha, the trap occured right at this instruction:
> # 0xc036835d <kseq_move+29>:	mov    0x38(%eax),%eax
> # Note from the trapframe that eax was 0 at that point, hence we're
> # dereferencing a NULL pointer.  Apparently this NULL was returned by
> # kseq_choose, and the actual null deref is the "ke->ke_runq" at line 448:
> (kgdb) l kseq_move
> 439             return (NULL);
> 440     }
> 441
> 442     void
> 443     kseq_move(struct kseq *from, int cpu)
> 444     {
> 445             struct kse *ke;
> 446
> 447             ke = kseq_choose(from);
> 448             runq_remove(ke->ke_runq, ke);
> (kgdb) l
> 449             ke->ke_state = KES_THREAD;
> 450             kseq_rem(from, ke);
> 451
> 452             ke->ke_cpu = cpu;
> 453             sched_add(ke);
> 454     }
> 455     #endif
> 456
> 457     struct kse *
> 458     kseq_choose(struct kseq *kseq)
> (kgdb) q
> ichotolot#
>
> So that's what I found.  Any suggestions on how to track this down further?
> _______________________________________________
> freebsd-current_at_freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-current
> To unsubscribe, send any mail to "freebsd-current-unsubscribe_at_freebsd.org"
>
Received on Thu Jul 03 2003 - 23:55:55 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:13 UTC