LOR and boot time lockups

From: Ian FREISLICH <if_at_hetzner.co.za>
Date: Fri, 21 Jan 2005 09:33:13 +0200
Hi

I've been unable to successfully boot a kernel since Jan 19.  The
offending change happened between Jan 10 and Jan 19.  I will do the
binary search for the change if nobody has any ideas, but I want to
ask first; it takes little over an hour to compile a kernel on this
system.

It fails at the ATA probe, or if I remove ata from the kernel, the
last message is from somewhere in the SCSI probe: "Waiting 15 seconds
for SCSI devices to settle".  I can't reasonably remove the SCSI
stuff from the kernel to see what happens next because the ufs root
is on a SCSI disk.

At least the debugger is avalable.  I don't really know what is
useful information so here's what I can think of off the top of my
head.

Timecounters tick every 1.000 msec
ipfw2 initialized, divert loadable, rule-based forwarding enabled, default to deny, logging unlimited
Waiting 15 seconds for SCSI devices to settle
*** Waited 1 minute then sent a line break***
KDB: enter: Line break on console
[thread pid 45 tid 100061 ]
Stopped at      kdb_enter+0x2b: nop     
	db> ce
Tracing pid 45 tid 100061 td 0xc10b8170
kdb_enter(c062c0eb) at kdb_enter+0x2b
siointr1(c10d8400) at siointr1+0xce
siointr(c10d8400) at siointr+0x5e
intr_execute_handlers(c1018090,d0e98cdc,4,d0e98d28,c05d0633) at intr_execute_han
dlers+0x7d
lapic_handle_intr(34) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc04dd1aa, esp = 0xd0e98d20, ebp = 0xd0e98d28 ---
critical_exit(c10afa80,0,c05d02dc,c04b639c,c10afa80) at critical_exit+0xd2
fork_exit(c04b639c,c10afa80,d0e98d48) at fork_exit+0x6c
fork_trampoline() at fork_trampoline+0x8
--- trap 0x1, eip = 0, esp = 0xd0e98d7c, ebp = 0 ---
db> sho pcpu 0
cpuid        = 0
curthread    = 0xc10b8170: pid 45 "swi0: sio"
curpcb       = 0xd0e98da0
fpcurthread  = none
idlethread   = 0xc0fe1730: pid 12 "idle: cpu0"
APIC ID      = 0
currentldt   = 0x28
db> sho pcpu 1
cpuid        = 1
curthread    = 0xc0fe15c0: pid 11 "idle: cpu1"
curpcb       = 0
fpcurthread  = none
idlethread   = 0xc0fe15c0: pid 11 "idle: cpu1"
APIC ID      = 1
currentldt   = 0x28
db> ps
  pid   proc     uid  ppid  pgrp  flag   stat  wmesg    wchan  cmd
   45 c10b59d8    0     0     0 0000204 [CPU 0] swi0: sio
    8 c10b5bd0    0     0     0 0000204 [RUNQ] aic_recovery0
    7 c102bbd0    0     0     0 0000204 [RUNQ] aic_recovery0
   44 c102bdc8    0     0     0 0000204 [RUNQ] usbtask
   43 c103d000    0     0     0 0000204 [RUNQ] usb0
    6 c103d1f8    0     0     0 0000204 [RUNQ] kqueue taskq
   42 c103d3f0    0     0     0 0000204 [IWAIT] swi2: cambio
   41 c103d5e8    0     0     0 0000204 [IWAIT] swi5:+
    5 c103d7e0    0     0     0 0000204 [RUNQ] thread taskq
   40 c103d9d8    0     0     0 0000204 [IWAIT] swi6:+
   39 c103dbd0    0     0     0 0000204 [IWAIT] swi6: task queue
   38 c103ddc8    0     0     0 0000204 [RUNQ] yarrow
    4 c103f000    0     0     0 0000204 [RUNQ] g_down
    3 c103f1f8    0     0     0 0000204 [RUNQ] g_up
    2 c10225e8    0     0     0 0000204 [RUNQ] g_event
   37 c10227e0    0     0     0 0000204 [IWAIT] swi1: net
   36 c10229d8    0     0     0 0000204 [IWAIT] swi3: vm
   35 c1022bd0    0     0     0 000020c [RUNQ] swi4: clock sio
   34 c1022dc8    0     0     0 0000204 [IWAIT] irq0: clk
   33 c102b000    0     0     0 0000204 [IWAIT] irq22:
   32 c102b1f8    0     0     0 0000204 [IWAIT] irq21:
   31 c102b3f0    0     0     0 0000204 [IWAIT] irq20:
   30 c102b5e8    0     0     0 0000204 [IWAIT] irq19:
   29 c102b7e0    0     0     0 0000204 [IWAIT] irq18:
   28 c102b9d8    0     0     0 0000204 [IWAIT] irq17:
   27 c0fea1f8    0     0     0 0000204 [IWAIT] irq16: xl0 ahc0
   26 c0fea3f0    0     0     0 0000204 [IWAIT] irq15:
   25 c0fea5e8    0     0     0 0000204 [IWAIT] irq14:
   24 c0fea7e0    0     0     0 0000204 [IWAIT] irq13:
   23 c0fea9d8    0     0     0 0000204 [IWAIT] irq12:
   22 c0feabd0    0     0     0 0000204 [IWAIT] irq11: uhci0
   21 c0feadc8    0     0     0 0000204 [IWAIT] irq10:
   20 c1022000    0     0     0 0000204 [IWAIT] irq9: intpm0
   19 c10221f8    0     0     0 0000204 [IWAIT] irq8: rtc
   18 c10223f0    0     0     0 0000204 [IWAIT] irq7: ppc0
   17 c0fe0000    0     0     0 0000204 [IWAIT] irq6:
   16 c0fe01f8    0     0     0 0000204 [IWAIT] irq5:
   15 c0fe03f0    0     0     0 0000204 [IWAIT] irq4: sio0
   14 c0fe05e8    0     0     0 0000204 [IWAIT] irq3: sio1
   13 c0fe07e0    0     0     0 0000204 [IWAIT] irq1: atkbd0
   12 c0fe09d8    0     0     0 000020c [Can run] idle: cpu0
   11 c0fe0bd0    0     0     0 000020c [Can run] idle: cpu1
    1 c0fe0dc8    0     0     0 0000200 [INACTIVE] swapper
   10 c0fea000    0     0     0 0000204 [RUNQ] ktrace
    0 c0667980    0     0     0 0000200 [SLPQ conifhk 0xc0648f6c][SLP] swapper


I then recompiled with witness and invarients and got the following
LOR earlier in the boot process.  I'm not sure if it's related, but
I couldn't find a reference it this particular LOR at
http://sources.zabbadoz.net/freebsd/lor.html.

lock order reversal
 1st 0xc0c64968 32 (UMA zone) _at_ /usr/src/sys/vm/uma_core.c:1842
 2nd 0xc0c59084 system map (system map) _at_ /usr/src/sys/vm/vm_map.c:2998
KDB: stack backtrace:
kdb_backtrace(0,ffffffff,c0672c88,c0672b70,c063dee8) at kdb_backtrace+0x29
witness_checkorder(c0c59084,9,c061f340,bb6) at witness_checkorder+0x54c
_mtx_lock_flags(c0c59084,0,c061f340,bb6) at _mtx_lock_flags+0x5b
_vm_map_lock_read(c0c59000,c061f340,bb6,200007d,c) at _vm_map_lock_read+0x24
vm_map_lookup(c0820ad8,c7057000,2,c0820adc,c0820acc) at vm_map_lookup+0x28
vm_fault(c0c59000,c7057000,2,0,c065fea0) at vm_fault+0x66
trap_pfault(c0820ba0,0,c7057a69) at trap_pfault+0x11b
trap(c0820018,c04e0010,c0610010,c0fd8620,c0c52420) at trap+0x30d
calltrap() at calltrap+0x5
--- trap 0xc, eip = 0xc0595100, esp = 0xc0820be0, ebp = 0xc0820bec ---
uma_dbg_alloc(c0c52420,0,c0fd8620,c0c64968,0) at uma_dbg_alloc+0x54
uma_zalloc_arg(c0c52420,0,1) at uma_zalloc_arg+0xfe
malloc(1c,c0640820,1,4,0) at malloc+0xae
resource_list_add(c10ad880,4,0,378,37f) at resource_list_add+0x2e
isa_set_resource(c1042580,c1095e00,4,0,378,8) at isa_set_resource+0x8d
bus_set_resource(c1095e00,4,0,378,8) at bus_set_resource+0x67
ppc_probe(c1095e00,c1095e00,c0fc7270,c1095e00,c0820d30) at ppc_probe+0xd7
ppc_isa_probe(c1095e00) at ppc_isa_probe+0x72
device_probe_child(c1042580,c1095e00,c10af000,c1095e00,1d) at device_probe_child
+0xf8
device_probe_and_attach(c1095e00) at device_probe_and_attach+0x7d
isa_probe_children(c1042580) at isa_probe_children+0x103
configure(0,81ec00,81e000,0,c042da65) at configure+0x32
mi_startup() at mi_startup+0x96
begin() at begin+0x2c


Fatal trap 12: page fault while in kernel mode
cpuid = 0; apic id = 00
fault virtual address   = 0xc7057a69
fault code              = supervisor write, page not present
instruction pointer     = 0x8:0xc0595100
stack pointer           = 0x10:0xc0820be0
frame pointer           = 0x10:0xc0820bec
code segment            = base 0x0, limit 0xfffff, type 0x1b
                        = DPL 0, pres 1, def32 1, gran 1
processor eflags        = interrupt enabled, resume, IOPL = 0
current process         = 0 (swapper)
[thread pid 0 tid 0 ]
Stopped at      uma_dbg_alloc+0x54:     movb    $0xff,0x18(%eax,%ecx,1)
db> sho proc
p = 0xc065fca0, vmspace = 0xc0660000, map = 0xc0660000, pmap = 0xc06600c0
Task map 0xc0660000: pmap=0xc06600c0, nentries=0, version=1
db> ps
  pid   proc     uid  ppid  pgrp  flag   stat  wmesg    wchan  cmd
    8 c10b5bd0    0     0     0 0000204 [RUNQ] aic_recovery0
    7 c102bbd0    0     0     0 0000204 [RUNQ] aic_recovery0
   44 c102bdc8    0     0     0 0000204 [RUNQ] usbtask
   43 c103d000    0     0     0 0000204 [RUNQ] usb0
   42 c103d1f8    0     0     0 0000204 [IWAIT] swi2: cambio
   41 c103d3f0    0     0     0 0000204 [IWAIT] swi5:+
    6 c103d5e8    0     0     0 0000204 [RUNQ] thread taskq
   40 c103d7e0    0     0     0 0000204 [IWAIT] swi6:+
   39 c103d9d8    0     0     0 0000204 [IWAIT] swi6: task queue
    5 c103dbd0    0     0     0 0000204 [RUNQ] kqueue taskq
   38 c103ddc8    0     0     0 0000204 [RUNQ] yarrow
    4 c103f000    0     0     0 0000204 [RUNQ] g_down
    3 c103f1f8    0     0     0 0000204 [RUNQ] g_up
    2 c10225e8    0     0     0 0000204 [RUNQ] g_event
   37 c10227e0    0     0     0 0000204 [IWAIT] swi1: net
   36 c10229d8    0     0     0 0000204 [IWAIT] swi3: vm
   35 c1022bd0    0     0     0 000020c [IWAIT] swi4: clock
   34 c1022dc8    0     0     0 0000204 [IWAIT] irq0:
   33 c102b000    0     0     0 0000204 [IWAIT] irq22:
   32 c102b1f8    0     0     0 0000204 [IWAIT] irq21:
   31 c102b3f0    0     0     0 0000204 [IWAIT] irq20:
   30 c102b5e8    0     0     0 0000204 [IWAIT] irq19:
   29 c102b7e0    0     0     0 0000204 [IWAIT] irq18:
   28 c102b9d8    0     0     0 0000204 [IWAIT] irq17:
   27 c0fea1f8    0     0     0 0000204 [IWAIT] irq16: xl0 ahc0
   26 c0fea3f0    0     0     0 0000204 [IWAIT] irq15:
   25 c0fea5e8    0     0     0 0000204 [IWAIT] irq14:
   24 c0fea7e0    0     0     0 0000204 [IWAIT] irq13:
   23 c0fea9d8    0     0     0 0000204 [IWAIT] irq12:
   22 c0feabd0    0     0     0 0000204 [RUNQ] irq11: uhci0
   21 c0feadc8    0     0     0 0000204 [IWAIT] irq10:
   20 c1022000    0     0     0 0000204 [IWAIT] irq9: intpm0
   19 c10221f8    0     0     0 0000204 [IWAIT] irq8:
   18 c10223f0    0     0     0 0000204 [IWAIT] irq7:
   17 c0fe0000    0     0     0 0000204 [IWAIT] irq6:
   16 c0fe01f8    0     0     0 0000204 [IWAIT] irq5:
   15 c0fe03f0    0     0     0 0000204 [IWAIT] irq4:
   14 c0fe05e8    0     0     0 0000204 [IWAIT] irq3:
   13 c0fe07e0    0     0     0 0000204 [IWAIT] irq1: atkbd0
   12 c0fe09d8    0     0     0 000020c [Can run] idle: cpu0
   11 c0fe0bd0    0     0     0 000020c [Can run] idle: cpu1
    1 c0fe0dc8    0     0     0 0000200 [INACTIVE] swapper
   10 c0fea000    0     0     0 0000204 [RUNQ] ktrace
    0 c065fca0    0     0     0 0000200 [CPU 0] swapper
db> show pcpu
cpuid        = 0
curthread    = 0xc065fea0: pid 0 "swapper"
curpcb       = 0xc0820da0
fpcurthread  = none
idlethread   = 0xc0fe1730: pid 12 "idle: cpu0"
APIC ID      = 0
currentldt   = 0x28
spin locks held:
db> show pcpu 1
cpuid        = 1
curthread    = 0xc0fe15c0: pid 11 "idle: cpu1"
curpcb       = 0
fpcurthread  = none
idlethread   = 0xc0fe15c0: pid 11 "idle: cpu1"
APIC ID      = 1
currentldt   = 0x28
spin locks held:


--
Ian Freislich
Received on Fri Jan 21 2005 - 06:33:17 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:38:26 UTC