On 2017-May-2, at 2:30 PM, Mark Millard <markmi at dsl-only.net> wrote: > On 2017-May-2, at 2:22 PM, Mark Millard <markmi at dsl-only.net> wrote: > >> It turns out that the bt's from the example panics are >> repeatable for the pc and lr sequence involved (but not >> the specific sp's and fp's involved). I report this in >> case it suggests anything. I'll note that the build had >> a production style kernel for a build of -r317015 . >> >> The first type of panic actually a back to back >> sequence of two bt's, this is the sleeping-thread type >> pf example. The second type is just one bt by itself. >> >> There is one variable lr in the bt for the sleeping-thread >> type of example (the first type of panic of the two shown >> later, the one with back-to-back bt's): >> >> 131,133c131,133 >> < handle_el0_sync() at 0x40040070 >> < pc = 0xffff0000006079e8 lr = 0x0000000040040070 >> < sp = 0xffff000065dfdba0 fp = 0x0000ffffffffeb00 >> --- >>> handle_el0_sync() at 0x40044490 >>> pc = 0xffff0000006079e8 lr = 0x0000000040044490 >>> sp = 0xffff000040229ba0 fp = 0x0000ffffffffe3d0 >> >> Otherwise the two bt's in the example match for the pc/lr >> sequence. >> >> I only have the two examples of this type to compare so >> far (one diff). >> >> I have 3 examples of the second type and they had no such >> variation. >> >> One thing in common to all 5 of these examples is the >> sequence: >> >> data_abort() at handle_el1h_sync+0x70 >> lr = 0xffff000000607870 >> handle_el1h_sync() at pmap_remove_pages+0x2a8 >> pc = 0xffff000000607870 lr = 0xffff0000006175d4 >> pmap_remove_pages() >> >> being involved in each example. >> >> >> I'm not saying that I can cause any panics at will, but >> when either of the two types happen the bt is (mostly) >> stable for the pc and lr sequence and that short >> sequence above is involved. >> >> I have seen one other type of panic but I did not manage >> to record a bt for it yet. It involved the instruction >> cache instead of arm64_dcache_wb_range . >> >> I quote the prior reported example bt's below. >> >> On 2017-May-2, at 5:24 AM, Mark Millard <markmi at dsl-only.net> wrote: >> >>> On 2017-May-2, at 3:37 AM, Mark Millard <markmi at dsl-only.net> wrote: >>> >>>> On 2017-May-2, at 2:53 AM, Mark Millard <markmi at dsl-only.net> wrote: >>>> >>>> . . . >>>> FYI: >>>> >>>> I do sometimes get things like: >>>> >>>> >>>> System shutdown time has arrived >>>> Apr 30 19:43:15 ODC2FBSD shutdown: power-down by root: >>>> Sleeping thread (tid 100093, pid 708) owns a non-sleepable lock >>>> KDB: stack backtrace of thread 100093: >>>> sched_switch() at mi_switch+0x100 >>>> pc = 0xffff000000347d44 lr = 0xffff000000327358 >>>> sp = 0xffff000040237e00 fp = 0xffff000040237e20 >>>> >>>> mi_switch() at sleepq_wait+0x3c >>>> pc = 0xffff000000327358 lr = 0xffff00000036c174 >>>> sp = 0xffff000040237e30 fp = 0xffff000040237e50 >>>> >>>> sleepq_wait() at _sleep+0x29c >>>> pc = 0xffff00000036c174 lr = 0xffff000000326c7c >>>> sp = 0xffff000040237e60 fp = 0xffff000040237ee0 >>>> >>>> _sleep() at vm_page_sleep_if_busy+0xb0 >>>> pc = 0xffff000000326c7c lr = 0xffff0000005cfcf4 >>>> sp = 0xffff000040237ef0 fp = 0xffff000040237f10 >>>> >>>> vm_page_sleep_if_busy() at vm_fault_hold+0xcc8 >>>> pc = 0xffff0000005cfcf4 lr = 0xffff0000005ba17c >>>> sp = 0xffff000040237f20 fp = 0xffff000040238070 >>>> >>>> vm_fault_hold() at vm_fault+0x70 >>>> pc = 0xffff0000005ba17c lr = 0xffff0000005b9464 >>>> sp = 0xffff000040238080 fp = 0xffff0000402380b0 >>>> >>>> vm_fault() at data_abort+0xe0 >>>> pc = 0xffff0000005b9464 lr = 0xffff00000061ad94 >>>> sp = 0xffff0000402380c0 fp = 0xffff000040238170 >>>> >>>> data_abort() at handle_el1h_sync+0x70 >>>> pc = 0xffff00000061ad94 lr = 0xffff000000607870 >>>> sp = 0xffff000040238180 fp = 0xffff000040238290 >>>> >>>> handle_el1h_sync() at pmap_enter+0x678 >>>> pc = 0xffff000000607870 lr = 0xffff000000615684 >>>> sp = 0xffff0000402382a0 fp = 0xffff0000402383b0 >>>> >>>> pmap_enter() at vm_fault_hold+0x17c0 >>>> pc = 0xffff000000615684 lr = 0xffff0000005bac74 >>>> sp = 0xffff0000402383c0 fp = 0xffff000040238510 >>>> >>>> vm_fault_hold() at vm_fault+0x70 >>>> pc = 0xffff0000005bac74 lr = 0xffff0000005b9464 >>>> sp = 0xffff000040238520 fp = 0xffff000040238550 >>>> >>>> vm_fault() at data_abort+0xe0 >>>> pc = 0xffff0000005b9464 lr = 0xffff00000061ad94 >>>> sp = 0xffff000040238560 fp = 0xffff000040238610 >>>> >>>> data_abort() at handle_el1h_sync+0x70 >>>> pc = 0xffff00000061ad94 lr = 0xffff000000607870 >>>> sp = 0xffff000040238620 fp = 0xffff000040238730 >>>> >>>> handle_el1h_sync() at pmap_remove_pages+0x2a8 >>>> pc = 0xffff000000607870 lr = 0xffff0000006175d4 >>>> sp = 0xffff000040238740 fp = 0xffff000040238870 >>>> >>>> pmap_remove_pages() at vmspace_exit+0xb0 >>>> pc = 0xffff0000006175d4 lr = 0xffff0000005c020c >>>> sp = 0xffff000040238880 fp = 0xffff0000402388b0 >>>> >>>> vmspace_exit() at exit1+0x604 >>>> pc = 0xffff0000005c020c lr = 0xffff0000002db5e0 >>>> sp = 0xffff0000402388c0 fp = 0xffff000040238920 >>>> >>>> exit1() at sys_sys_exit+0x10 >>>> pc = 0xffff0000002db5e0 lr = 0xffff0000002dafd8 >>>> sp = 0xffff000040238930 fp = 0xffff000040238930 >>>> >>>> sys_sys_exit() at do_el0_sync+0xa48 >>>> pc = 0xffff0000002dafd8 lr = 0xffff00000061b91c >>>> sp = 0xffff000040238940 fp = 0xffff000040238a70 >>>> >>>> do_el0_sync() at handle_el0_sync+0x6c >>>> pc = 0xffff00000061b91c lr = 0xffff0000006079e8 >>>> sp = 0xffff000040238a80 fp = 0xffff000040238b90 >>>> >>>> handle_el0_sync() at 0x38cc0 >>>> pc = 0xffff0000006079e8 lr = 0x0000000000038cc0 >>>> sp = 0xffff000040238ba0 fp = 0x0000ffffffffed00 >>>> >>>> panic: sleeping thread >>>> cpuid = 2 >>>> time = 1493581440 >>>> KDB: stack backtrace: >>>> db_trace_self() at db_trace_self_wrapper+0x28 >>>> pc = 0xffff000000605cc0 lr = 0xffff0000000869cc >>>> sp = 0xffff000065dfd320 fp = 0xffff000065dfd530 >>>> >>>> db_trace_self_wrapper() at vpanic+0x164 >>>> pc = 0xffff0000000869cc lr = 0xffff00000031d464 >>>> sp = 0xffff000065dfd540 fp = 0xffff000065dfd5b0 >>>> >>>> vpanic() at panic+0x4c >>>> pc = 0xffff00000031d464 lr = 0xffff00000031d2fc >>>> sp = 0xffff000065dfd5c0 fp = 0xffff000065dfd640 >>>> >>>> panic() at propagate_priority+0x2d0 >>>> pc = 0xffff00000031d2fc lr = 0xffff000000374558 >>>> sp = 0xffff000065dfd650 fp = 0xffff000065dfd690 >>>> >>>> propagate_priority() at turnstile_wait+0x340 >>>> pc = 0xffff000000374558 lr = 0xffff00000037503c >>>> sp = 0xffff000065dfd6a0 fp = 0xffff000065dfd6e0 >>>> >>>> turnstile_wait() at __rw_wlock_hard+0x208 >>>> pc = 0xffff00000037503c lr = 0xffff000000319138 >>>> sp = 0xffff000065dfd6f0 fp = 0xffff000065dfd770 >>>> >>>> __rw_wlock_hard() at pmap_enter+0xe98 >>>> pc = 0xffff000000319138 lr = 0xffff000000615ea4 >>>> sp = 0xffff000065dfd780 fp = 0xffff000065dfd810 >>>> >>>> pmap_enter() at vm_fault_hold+0x28c >>>> pc = 0xffff000000615ea4 lr = 0xffff0000005b9740 >>>> sp = 0xffff000065dfd820 fp = 0xffff000065dfd970 >>>> >>>> vm_fault_hold() at vm_fault+0x70 >>>> pc = 0xffff0000005b9740 lr = 0xffff0000005b9464 >>>> sp = 0xffff000065dfd980 fp = 0xffff000065dfd9b0 >>>> >>>> vm_fault() at data_abort+0xe0 >>>> pc = 0xffff0000005b9464 lr = 0xffff00000061ad94 >>>> sp = 0xffff000065dfd9c0 fp = 0xffff000065dfda70 >>>> >>>> data_abort() at handle_el0_sync+0x6c >>>> pc = 0xffff00000061ad94 lr = 0xffff0000006079e8 >>>> sp = 0xffff000065dfda80 fp = 0xffff000065dfdb90 >>>> >>>> handle_el0_sync() at 0x40040070 >>>> pc = 0xffff0000006079e8 lr = 0x0000000040040070 >>>> sp = 0xffff000065dfdba0 fp = 0x0000ffffffffeb00 >>>> >>>> KDB: enter: panic >>>> [ thread pid 709 tid 100086 ] >>>> Stopped at kdb_enter+0x44: undefined d4200000 >>>> db> >>> >>> Another example failure is: >>> >>> Fatal data abort: >>> x0: 400a9000 >>> x1: 1000 >>> x2: 0 >>> x3: 40 >>> x4: 3f >>> x5: fffffd00304e5000 >>> x6: 2b52 >>> x7: c >>> x8: b >>> x9: fffffd000076d5d0 >>> x10: 68 >>> x11: 40000000 >>> x12: 704c5000 >>> x13: 42b42003 >>> x14: 42b42003 >>> x15: 40000000 >>> x16: c >>> x17: ffffffffffffffff >>> x18: ffff000065dd5310 >>> x19: 800000000000000 >>> x20: 1 >>> x21: fffffd0002b43000 >>> x22: 12000004556478b >>> x23: f000000000000000 >>> x24: fffffd0002b41bc8 >>> x25: 40 >>> x26: fffffd0002b42548 >>> x27: 7b >>> x28: 3 >>> x29: ffff000065dd53c0 >>> sp: ffff000065dd5310 >>> lr: ffff0000006175d8 >>> elr: ffff00000060589c >>> spsr: 60000345 >>> far: 400a9000 >>> esr: 96000147 >>> [ thread pid 715 tid 100078 ] >>> Stopped at arm64_dcache_wb_range+0x18: undefined d50b7a20 >>> db> bt >>> Tracing pid 715 tid 100078 td 0xfffffd00007849c0 >>> db_trace_self() at db_stack_trace+0xf0 >>> pc = 0xffff000000605cc0 lr = 0xffff0000000840e0 >>> sp = 0xffff000065dd4cb0 fp = 0xffff000065dd4ce0 >>> >>> db_stack_trace() at db_command+0x23c >>> pc = 0xffff0000000840e0 lr = 0xffff000000083d58 >>> sp = 0xffff000065dd4cf0 fp = 0xffff000065dd4dd0 >>> >>> db_command() at db_command_loop+0x60 >>> pc = 0xffff000000083d58 lr = 0xffff000000083b00 >>> sp = 0xffff000065dd4de0 fp = 0xffff000065dd4e00 >>> >>> db_command_loop() at db_trap+0xf4 >>> pc = 0xffff000000083b00 lr = 0xffff000000086b34 >>> sp = 0xffff000065dd4e10 fp = 0xffff000065dd5030 >>> >>> db_trap() at kdb_trap+0x180 >>> pc = 0xffff000000086b34 lr = 0xffff00000035f650 >>> sp = 0xffff000065dd5040 fp = 0xffff000065dd50a0 >>> >>> kdb_trap() at data_abort+0x1a0 >>> pc = 0xffff00000035f650 lr = 0xffff00000061ae54 >>> sp = 0xffff000065dd50b0 fp = 0xffff000065dd5160 >>> >>> data_abort() at handle_el1h_sync+0x70 >>> pc = 0xffff00000061ae54 lr = 0xffff000000607870 >>> sp = 0xffff000065dd5170 fp = 0xffff000065dd5280 >>> >>> handle_el1h_sync() at pmap_remove_pages+0x2a8 >>> pc = 0xffff000000607870 lr = 0xffff0000006175d4 >>> sp = 0xffff000065dd5290 fp = 0xffff000065dd53c0 >>> >>> pmap_remove_pages() at exec_new_vmspace+0x1a4 >>> pc = 0xffff0000006175d4 lr = 0xffff0000002d9da0 >>> sp = 0xffff000065dd53d0 fp = 0xffff000065dd5430 >>> >>> exec_new_vmspace() at exec_elf64_imgact+0xa70 >>> pc = 0xffff0000002d9da0 lr = 0xffff0000002b7c14 >>> sp = 0xffff000065dd5440 fp = 0xffff000065dd5550 >>> >>> exec_elf64_imgact() at kern_execve+0x664 >>> pc = 0xffff0000002b7c14 lr = 0xffff0000002d8730 >>> sp = 0xffff000065dd5560 fp = 0xffff000065dd58b0 >>> >>> kern_execve() at sys_execve+0x54 >>> pc = 0xffff0000002d8730 lr = 0xffff0000002d7d90 >>> sp = 0xffff000065dd58c0 fp = 0xffff000065dd5930 >>> >>> sys_execve() at do_el0_sync+0xa48 >>> pc = 0xffff0000002d7d90 lr = 0xffff00000061b91c >>> sp = 0xffff000065dd5940 fp = 0xffff000065dd5a70 >>> >>> do_el0_sync() at handle_el0_sync+0x6c >>> pc = 0xffff00000061b91c lr = 0xffff0000006079e8 >>> sp = 0xffff000065dd5a80 fp = 0xffff000065dd5b90 >>> >>> handle_el0_sync() at 0x24a90 >>> pc = 0xffff0000006079e8 lr = 0x0000000000024a90 >>> sp = 0xffff000065dd5ba0 fp = 0x0000ffffffffe7d0 >>> >>> db> > > Because Konstanin B. was not Cc'd/To'd previously > I should have included the following background > information about how this was run on a > Odroid-C2 under UbuntuMate: > > qemu-system-aarch64 -m 1024M -enable-kvm -cpu host -machine virt \ > -bios QEMU_EFI.fd -nographic \ > -drive format=raw,if=none,file=FreeBSD-12.0-CURRENT-arm64-aarch64.raw,id=hd0 \ > -device virtio-blk-device,drive=hd0 \ > -device virtio-net-device,netdev=net0 \ > -netdev user,id=net0 \ > -smp cpus=4 > > based on: > > https://releases.linaro.org/components/kernel/uefi-linaro/16.02/release/qemu64/QEMU_EFI.fd > > and my build of head -r317015 turned into a .raw file. The code around handle_el1h_sync+0x70 : ffff000000607804 <handle_el1h_sync+0x4> sub sp, sp, #0x80 ffff000000607808 <handle_el1h_sync+0x8> sub sp, sp, #0x120 ffff00000060780c <handle_el1h_sync+0xc> stp x29, x30, [sp,#272] ffff000000607810 <handle_el1h_sync+0x10> stp x28, x29, [sp,#256] ffff000000607814 <handle_el1h_sync+0x14> stp x26, x27, [sp,#240] ffff000000607818 <handle_el1h_sync+0x18> stp x24, x25, [sp,#224] ffff00000060781c <handle_el1h_sync+0x1c> stp x22, x23, [sp,#208] ffff000000607820 <handle_el1h_sync+0x20> stp x20, x21, [sp,#192] ffff000000607824 <handle_el1h_sync+0x24> stp x18, x19, [sp,#176] ffff000000607828 <handle_el1h_sync+0x28> stp x16, x17, [sp,#160] ffff00000060782c <handle_el1h_sync+0x2c> stp x14, x15, [sp,#144] ffff000000607830 <handle_el1h_sync+0x30> stp x12, x13, [sp,#128] ffff000000607834 <handle_el1h_sync+0x34> stp x10, x11, [sp,#112] ffff000000607838 <handle_el1h_sync+0x38> stp x8, x9, [sp,#96] ffff00000060783c <handle_el1h_sync+0x3c> stp x6, x7, [sp,#80] ffff000000607840 <handle_el1h_sync+0x40> stp x4, x5, [sp,#64] ffff000000607844 <handle_el1h_sync+0x44> stp x2, x3, [sp,#48] ffff000000607848 <handle_el1h_sync+0x48> stp x0, x1, [sp,#32] ffff00000060784c <handle_el1h_sync+0x4c> mrs x10, elr_el1 ffff000000607850 <handle_el1h_sync+0x50> mrs x11, spsr_el1 ffff000000607854 <handle_el1h_sync+0x54> mrs x12, esr_el1 ffff000000607858 <handle_el1h_sync+0x58> str x10, [sp,#16] ffff00000060785c <handle_el1h_sync+0x5c> stp w11, w12, [sp,#24] ffff000000607860 <handle_el1h_sync+0x60> stp x18, x30, [sp] ffff000000607864 <handle_el1h_sync+0x64> mrs x18, tpidr_el1 ffff000000607868 <handle_el1h_sync+0x68> add x29, sp, #0x110 ffff00000060786c <handle_el1h_sync+0x6c> mov x0, sp ffff000000607870 <handle_el1h_sync+0x70> bl ffff00000061aad8 <do_el1h_sync> ffff000000607874 <handle_el1h_sync+0x74> msr daifset, #0x2 ffff000000607878 <handle_el1h_sync+0x78> ldp x18, x30, [sp] ffff00000060787c <handle_el1h_sync+0x7c> ldp x10, x11, [sp,#16] ffff000000607880 <handle_el1h_sync+0x80> msr spsr_el1, x11 ffff000000607884 <handle_el1h_sync+0x84> msr elr_el1, x10 ffff000000607888 <handle_el1h_sync+0x88> ldp x0, x1, [sp,#32] ffff00000060788c <handle_el1h_sync+0x8c> ldp x2, x3, [sp,#48] ffff000000607890 <handle_el1h_sync+0x90> ldp x4, x5, [sp,#64] ffff000000607894 <handle_el1h_sync+0x94> ldp x6, x7, [sp,#80] ffff000000607898 <handle_el1h_sync+0x98> ldp x8, x9, [sp,#96] ffff00000060789c <handle_el1h_sync+0x9c> ldp x10, x11, [sp,#112] ffff0000006078a0 <handle_el1h_sync+0xa0> ldp x12, x13, [sp,#128] ffff0000006078a4 <handle_el1h_sync+0xa4> ldp x14, x15, [sp,#144] ffff0000006078a8 <handle_el1h_sync+0xa8> ldp x16, x17, [sp,#160] ffff0000006078ac <handle_el1h_sync+0xac> ldr x29, [sp,#264] ffff0000006078b0 <handle_el1h_sync+0xb0> mov sp, x18 ffff0000006078b4 <handle_el1h_sync+0xb4> mrs x18, tpidr_el1 ffff0000006078b8 <handle_el1h_sync+0xb8> eret So the bl to do_el1h_sync apparently gets the data_abort. The code around pmap_remove_pages+0x2a8 : ffff000000617570 <pmap_remove_pages+0x244> bl ffff0000005cf83c <PHYS_TO_VM_PAGE> ffff000000617574 <pmap_remove_pages+0x248> ldr x9, [sp,#80] ffff000000617578 <pmap_remove_pages+0x24c> adrp x8, ffff000000bbd000 <pmc_tf+0x6210> ffff00000061757c <pmap_remove_pages+0x250> add x8, x8, #0x848 ffff000000617580 <pmap_remove_pages+0x254> str x0, [sp,#48] ffff000000617584 <pmap_remove_pages+0x258> cmp x9, x8 ffff000000617588 <pmap_remove_pages+0x25c> b.eq ffff0000006175a4 <pmap_remove_pages+0x278> ffff00000061758c <pmap_remove_pages+0x260> ldr x8, [x18] ffff000000617590 <pmap_remove_pages+0x264> ldr x8, [x8,#8] ffff000000617594 <pmap_remove_pages+0x268> ldr x8, [x8,#512] ffff000000617598 <pmap_remove_pages+0x26c> ldr x8, [x8,#224] ffff00000061759c <pmap_remove_pages+0x270> cmp x8, x9 ffff0000006175a0 <pmap_remove_pages+0x274> b.ne ffff0000006175d8 <pmap_remove_pages+0x2ac> ffff0000006175a4 <pmap_remove_pages+0x278> and x8, x22, #0x1f ffff0000006175a8 <pmap_remove_pages+0x27c> cmp x28, #0x3 ffff0000006175ac <pmap_remove_pages+0x280> b.ne ffff0000006175c4 <pmap_remove_pages+0x298> ffff0000006175b0 <pmap_remove_pages+0x284> cmp x8, #0xb ffff0000006175b4 <pmap_remove_pages+0x288> b.ne ffff0000006175d8 <pmap_remove_pages+0x2ac> ffff0000006175b8 <pmap_remove_pages+0x28c> ldr x0, [x24] ffff0000006175bc <pmap_remove_pages+0x290> orr w1, wzr, #0x1000 ffff0000006175c0 <pmap_remove_pages+0x294> b ffff0000006175d4 <pmap_remove_pages+0x2a8> ffff0000006175c4 <pmap_remove_pages+0x298> cmp x8, #0x9 ffff0000006175c8 <pmap_remove_pages+0x29c> b.ne ffff0000006175d8 <pmap_remove_pages+0x2ac> ffff0000006175cc <pmap_remove_pages+0x2a0> ldr x0, [x24] ffff0000006175d0 <pmap_remove_pages+0x2a4> orr w1, wzr, #0x200000 ffff0000006175d4 <pmap_remove_pages+0x2a8> bl ffff000000605884 <arm64_dcache_wb_range> ffff0000006175d8 <pmap_remove_pages+0x2ac> mov x8, xzr ffff0000006175dc <pmap_remove_pages+0x2b0> orr w1, wzr, #0x8 ffff0000006175e0 <pmap_remove_pages+0x2b4> mov x0, x26 ffff0000006175e4 <pmap_remove_pages+0x2b8> ldxr x9, [x26] ffff0000006175e8 <pmap_remove_pages+0x2bc> stxr w10, x8, [x26] ffff0000006175ec <pmap_remove_pages+0x2c0> cbnz w10, ffff0000006175e4 <pmap_remove_pages+0x2b8> ffff0000006175f0 <pmap_remove_pages+0x2c4> bl ffff000000605884 <arm64_dcache_wb_range> So this happens to involve arm64_dcache_wb_range (that has not started yet). === Mark Millard markmi at dsl-only.netReceived on Tue May 02 2017 - 19:59:58 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:11 UTC