Panic in scheduler code with SCHED_ULE during boot to multi-user.

From: Richard Todd <rmtodd_at_servalan.servalan.com> Date: Thu, 03 Jul 2003 23:58:06 -0500 · This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:13 UTC

Hi.  Last night I upgraded to the most recent -current source and
rebuilt everything, and decided on building the kernel to try the new
SCHED_ULE scheduler (I had been using SCHED_4BSD before).  Alas, the
experiment did not go well; every time I booted the machine, I got a
panic just as the system was about to put up the login prompt.
Switching the kernel config back to SCHED_4BSD and building a kernel
with the same (last night's) sources gave me a working kernel.  This
is on a dual-processor PII/400 box.  Below I list what I've got from a
kernel coredump of the SCHED_ULE kernel; I've added my comments on the gdb
listing preceded by "#" signs.

ichotolot# gdb -k ./kernel.debug ./vmcore.45
GNU gdb 5.2.1 (FreeBSD)
Copyright 2002 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for details.
This GDB was configured as "i386-undermydesk-freebsd"...
panic: page fault
panic messages:
---
Fatal trap 12: page fault while in kernel mode
cpuid = 1; lapic.id = 01000000
fault virtual address	= 0x38
fault code		= supervisor read, page not present
instruction pointer	= 0x8:0xc036835d
stack pointer	        = 0x10:0xe1cfbbbc
frame pointer	        = 0x10:0xe1cfbbcc
code segment		= base 0x0, limit 0xfffff, type 0x1b
			= DPL 0, pres 1, def32 1, gran 1
processor eflags	= interrupt enabled, resume, IOPL = 0
current process		= 649 (squid)
trap number		= 12
panic: page fault
cpuid = 1; lapic.id = 01000000
boot() called on cpu#1

syncing disks, buffers remaining... panic: absolutely cannot call smp_ipi_shootdown with interrupts already disabled
cpuid = 1; lapic.id = 01000000
boot() called on cpu#1
Uptime: 1m12s
Dumping 638 MB
 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 256 272 288 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576 592 608 624
---
Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug...done.
Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug
Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug...done.
Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug
Reading symbols from /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug...done.
Loaded symbols for /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug
Reading symbols from /boot/kernel/green_saver.ko...done.
Loaded symbols for /boot/kernel/green_saver.ko
#0  doadump () at ../../../kern/kern_shutdown.c:240
240		dumping++;
(kgdb) bt
#0  doadump () at ../../../kern/kern_shutdown.c:240
#1  0xc03547c0 in boot (howto=260) at ../../../kern/kern_shutdown.c:372
#2  0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
#3  0xc050f9db in smp_tlb_shootdown (vector=0, addr1=0, addr2=0)
    at ../../../i386/i386/mp_machdep.c:2387
#4  0xc050fc79 in smp_invlpg_range (addr1=0, addr2=0)
    at ../../../i386/i386/mp_machdep.c:2519
#5  0xc0511df8 in pmap_invalidate_range (pmap=0xc06dc620, sva=3568271360, 
    eva=1) at ../../../i386/i386/pmap.c:719
#6  0xc0512118 in pmap_qenter (sva=3568271360, m=0xe1cfb8c0, count=-1)
    at ../../../i386/i386/pmap.c:943
#7  0xc03a0448 in vm_hold_load_pages (bp=0xd199d440, from=3568271360, 
    to=3568279552) at ../../../kern/vfs_bio.c:3574
#8  0xc039ea5c in allocbuf (bp=0xd199d440, size=6144)
    at ../../../kern/vfs_bio.c:2752
#9  0xc039e6fe in geteblk (size=6144) at ../../../kern/vfs_bio.c:2634
#10 0xc039b210 in bwrite (bp=0xd188b8e0) at ../../../kern/vfs_bio.c:818
#11 0xc039bc6c in bawrite (bp=0x0) at ../../../kern/vfs_bio.c:1153
#12 0xc03a4860 in vop_stdfsync (ap=0xe1cfba14)
    at ../../../kern/vfs_default.c:742
#13 0xc031ba10 in spec_fsync (ap=0xe1cfba14)
    at ../../../fs/specfs/spec_vnops.c:417
#14 0xc031ae38 in spec_vnoperate (ap=0x0)
    at ../../../fs/specfs/spec_vnops.c:122
#15 0xc04ad2d7 in ffs_sync (mp=0xc5336400, waitfor=2, cred=0xc1c27e80, 
    td=0xc0669ec0) at vnode_if.h:624
#16 0xc03b0b1b in sync (td=0xc0669ec0, uap=0x0)
    at ../../../kern/vfs_syscalls.c:142
#17 0xc03542e2 in boot (howto=256) at ../../../kern/kern_shutdown.c:281
#18 0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
#19 0xc0517292 in trap_fatal (frame=0xe1cfbb7c, eva=0)
    at ../../../i386/i386/trap.c:836
#20 0xc0516863 in trap (frame=
      {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
    at ../../../i386/i386/trap.c:256
#21 0xc04ff988 in calltrap () at {standard input}:97
#22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
#23 0xc035a636 in choosethread () at ../../../kern/kern_switch.c:140
#24 0xc035c340 in mi_switch () at ../../../kern/kern_synch.c:524
#25 0xc034b686 in _mtx_lock_sleep (m=0xc066e680, opts=0, 
    file=0xc058251d "../../../kern/kern_descrip.c", line=826)
    at ../../../kern/kern_mutex.c:641
#26 0xc034b0f8 in _mtx_lock_flags (m=0xc066e680, opts=0, 
    file=0xc058251d "../../../kern/kern_descrip.c", line=826)
    at ../../../kern/kern_mutex.c:331
#27 0xc0335e11 in close (td=0xc5757130, uap=0x0)
    at ../../../kern/kern_descrip.c:826
#28 0xc05175ee in syscall (frame=
      {tf_fs = 47, tf_es = 47, tf_ds = 47, tf_edi = -1077936744, tf_esi = 0, tf_ebp = -1077936840, tf_isp = -506479244, tf_ebx = 689, tf_edx = -1, tf_ecx = 9, tf_eax = 6, tf_trapno = 12, tf_err = 2, tf_eip = 672626595, tf_cs = 31, tf_eflags = 663, tf_esp = -1077936900, tf_ss = 47}) at ../../../i386/i386/trap.c:1023
#29 0xc04ff9dd in Xint0x80_syscall () at {standard input}:139
---Can't read userspace from dump, or kernel process---
# The initial panic seems to be around frame 22, so let's go there.
(kgdb) fr 22
#22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
1164			kseq_move(kseq, PCPU_GET(cpuid));
(kgdb) p kseq
$1 = (struct kseq *) 0x0
(kgdb) l 1160
1155			 */
1156			if ((kseq = kseq_load_highest()) == NULL)
1157				return (NULL);
1158	
1159			/*
1160			 * Remove this kse from this kseq and runq and then requeue
1161			 * on the current processor.  Then we will dequeue it
1162			 * normally above.
1163			 */
1164			kseq_move(kseq, PCPU_GET(cpuid));
# Hmm, that's odd.  You'd think that if kseq returned by kseq_load_highest() 
# was null, line 1157 would have returned and we'd never hit line 1164.
# Weird.  Could be gdb is just being confused here, though.  Hard to say.
# Anyway, on further inspection, it seems the fatal trap wasn't in 
# sched_choose() at all, but elsewhere; look at the eip in the trap frame:
(kgdb) fr 20
#20 0xc0516863 in trap (frame=
      {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
    at ../../../i386/i386/trap.c:256
256				trap_fatal(&frame, eva);
(kgdb) p/x frame.tf_eip
$2 = 0xc036835d
(kgdb) disass 0xc036835d
Dump of assembler code for function kseq_move:
0xc0368340 <kseq_move>:	push   %ebp
0xc0368341 <kseq_move+1>:	mov    %esp,%ebp
0xc0368343 <kseq_move+3>:	sub    $0x10,%esp
0xc0368346 <kseq_move+6>:	mov    %ebx,0xfffffff8(%ebp)
0xc0368349 <kseq_move+9>:	mov    %esi,0xfffffffc(%ebp)
0xc036834c <kseq_move+12>:	mov    0x8(%ebp),%esi
0xc036834f <kseq_move+15>:	mov    %esi,(%esp,1)
0xc0368352 <kseq_move+18>:	call   0xc03683a0 <kseq_choose>
0xc0368357 <kseq_move+23>:	mov    %eax,%ebx
0xc0368359 <kseq_move+25>:	mov    %eax,0x4(%esp,1)
0xc036835d <kseq_move+29>:	mov    0x38(%eax),%eax
0xc0368360 <kseq_move+32>:	mov    0x4(%eax),%eax
0xc0368363 <kseq_move+35>:	mov    %eax,(%esp,1)
0xc0368366 <kseq_move+38>:	call   0xc035ac50 <runq_remove>
0xc036836b <kseq_move+43>:	movl   $0x4,0x30(%ebx)
0xc0368372 <kseq_move+50>:	mov    %ebx,0x4(%esp,1)
0xc0368376 <kseq_move+54>:	mov    %esi,(%esp,1)
0xc0368379 <kseq_move+57>:	call   0xc0367f30 <kseq_rem>
0xc036837e <kseq_move+62>:	mov    0x38(%ebx),%edx
0xc0368381 <kseq_move+65>:	mov    0xc(%ebp),%eax
0xc0368384 <kseq_move+68>:	mov    %al,0x14(%edx)
0xc0368387 <kseq_move+71>:	mov    %ebx,(%esp,1)
0xc036838a <kseq_move+74>:	call   0xc0369490 <sched_add>
0xc036838f <kseq_move+79>:	mov    0xfffffff8(%ebp),%ebx
0xc0368392 <kseq_move+82>:	mov    0xfffffffc(%ebp),%esi
0xc0368395 <kseq_move+85>:	mov    %ebp,%esp
0xc0368397 <kseq_move+87>:	pop    %ebp
0xc0368398 <kseq_move+88>:	ret    
End of assembler dump.
# Aha, the trap occured right at this instruction:
# 0xc036835d <kseq_move+29>:	mov    0x38(%eax),%eax
# Note from the trapframe that eax was 0 at that point, hence we're 
# dereferencing a NULL pointer.  Apparently this NULL was returned by
# kseq_choose, and the actual null deref is the "ke->ke_runq" at line 448:
(kgdb) l kseq_move
439             return (NULL);
440     }
441
442     void
443     kseq_move(struct kseq *from, int cpu)
444     {
445             struct kse *ke;
446
447             ke = kseq_choose(from);
448             runq_remove(ke->ke_runq, ke);
(kgdb) l
449             ke->ke_state = KES_THREAD;
450             kseq_rem(from, ke);
451
452             ke->ke_cpu = cpu;
453             sched_add(ke);
454     }
455     #endif
456
457     struct kse *
458     kseq_choose(struct kseq *kseq)
(kgdb) q
ichotolot# 

So that's what I found.  Any suggestions on how to track this down further?