On 23/04/2018 17:50, Julian Elischer wrote: > back trace at: http://www.freebsd.org/~julian/bob-crash.png > > If anyone wants to take a look.. > > In the exit syscall, while deallocating a vm object. > > I haven't seen references to a similar crash in the last 10 days or so.. But if > it rings any bells... We have just got another one: panic: Bad link elm 0xfffff80cc3938360 prev->next != elm Matching disassembled code to C code, it seems that the crash is somewhere in vm_object_terminate_pages (inlined into vm_object_terminate), probably in one of TAILQ_REMOVE-s there: if (p->queue != PQ_NONE) { KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: " "page %p is not queued", p)); pq1 = vm_page_pagequeue(p); if (pq != pq1) { if (pq != NULL) { vm_pagequeue_cnt_add(pq, dequeued); vm_pagequeue_unlock(pq); } pq = pq1; vm_pagequeue_lock(pq); dequeued = 0; } p->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, p, plinks.q); dequeued--; } if (vm_page_free_prep(p, true)) continue; unlist: TAILQ_REMOVE(&object->memq, p, listq); } Please note that this is the code before r332974 Improve VM page queue scalability. I am not sure if r332974 + r333256 would fix the problem or if it would just get moved to a different place. Does this ring a bell to anyone who tinkered with that part of the VM code recently? Looking a little bit further, I think that object->memq somehow got corrupted. memq contains just two elements and the reported element is not there. 
(kgdb) p *(struct vm_page *)0xfffff80cc3938360 $22 = { plinks = { q = { tqe_next = 0xfffff80cd7175398, tqe_prev = 0xfffff80cb9f69170 }, s = { ss = { sle_next = 0xfffff80cd7175398 }, pv = 0xfffff80cb9f69170 }, memguard = { p = 18446735332764767128, v = 18446735332276081008 } }, listq = { tqe_next = 0xfffff80cc3938568, <============= tqe_prev = 0xfffff8078c11b848 <============= }, object = 0x0, pindex = 1548, phys_addr = 14695911424, md = { pv_list = { tqh_first = 0x0, tqh_last = 0xfffff80cc3938398 }, pv_gen = 1205766, pat_mode = 6 }, wire_count = 0, busy_lock = 1, hold_count = 0, flags = 0, aflags = 0 '\000', oflags = 0 '\000', queue = 255 '\377', psind = 0 '\000', segind = 5 '\005', order = 13 '\r', pool = 0 '\000', act_count = 5 '\005', valid = 0 '\000', dirty = 0 '\000' } (kgdb) p object->memq $11 = { tqh_first = 0xfffff80cb861cfb8, tqh_last = 0xfffff80cc3938780 } (kgdb) p *object->memq.tqh_first $25 = { plinks = { q = { tqe_next = 0xfffff80cb9f69108, tqe_prev = 0xfffff80cd7175398 }, s = { ss = { sle_next = 0xfffff80cb9f69108 }, pv = 0xfffff80cd7175398 }, memguard = { p = 18446735332276080904, v = 18446735332764767128 } }, listq = { tqe_next = 0xfffff80cb56eafb0, <============= tqe_prev = 0xfffff8078c11b848 <============= }, object = 0xfffff8078c11b800, pindex = 515, phys_addr = 7299219456, md = { pv_list = { tqh_first = 0xfffff80b99e4ff88, tqh_last = 0xfffff80b99e4ff90 }, pv_gen = 466177, pat_mode = 6 }, wire_count = 0, busy_lock = 2, hold_count = 0, flags = 0, aflags = 0 '\000', oflags = 0 '\000', queue = 255 '\377', psind = 0 '\000', segind = 5 '\005', order = 13 '\r', pool = 0 '\000', act_count = 5 '\005', valid = 255 '\377', dirty = 0 '\000' } (kgdb) p *object->memq.tqh_first->listq.tqe_next $26 = { plinks = { q = { tqe_next = 0x0, tqe_prev = 0xfffff80cc92e1d18 }, s = { ss = { sle_next = 0x0 }, pv = 0xfffff80cc92e1d18 }, memguard = { p = 0, v = 18446735332531379480 } }, listq = { tqe_next = 0x0, <============= tqe_prev = 0xfffff80cb861cfc8 <============= }, 
object = 0xfffff8078c11b800, pindex = 1548, phys_addr = 5350158336, md = { pv_list = { tqh_first = 0xfffff80a07222808, tqh_last = 0xfffff80a07222810 }, pv_gen = 7085, pat_mode = 6 }, wire_count = 0, busy_lock = 1, hold_count = 0, flags = 0, aflags = 1 '\001', oflags = 0 '\000', queue = 1 '\001', psind = 0 '\000', segind = 5 '\005', order = 13 '\r', pool = 0 '\000', act_count = 5 '\005', valid = 255 '\377', dirty = 255 '\377' } Pages 0xfffff80cc3938360 (the reported one) and 0xfffff80cb56eafb0 (the last one on memq) have the same index 1548. Also, memq.tqh_last points to the reported page, but it is not reachable via tqe_next pointers. It's also potentially interesting that the reported page looks like it's already freed and the replacement page is both valid and dirty. The object, just in case: (kgdb) p *object $34 = { lock = { lock_object = { lo_name = 0xffffffff81202c27 "vm object", lo_flags = 627245056, lo_data = 0, lo_witness = 0xfffff80cffd6a700 }, rw_lock = 18446735286009226592 }, object_list = { tqe_next = 0xfffff80b2481e200, tqe_prev = 0xfffff80b2481e020 }, shadow_head = { lh_first = 0x0 }, shadow_list = { le_next = 0xfffff809c070f900, le_prev = 0xfffff80869c06c30 }, memq = { tqh_first = 0xfffff80cb861cfb8, tqh_last = 0xfffff80cc3938780 }, rtree = { rt_root = 18446735279843613792 }, size = 1561, domain = { dr_policy = 0x0, dr_iterator = 0 }, generation = 1, ref_count = 0, shadow_count = 0, memattr = 6 '\006', type = 0 '\000', flags = 12296, pg_color = 1809, paging_in_progress = 0, resident_page_count = 5, backing_object = 0x0, backing_object_offset = 0, pager_object_list = { tqe_next = 0x0, tqe_prev = 0x0 }, rvq = { lh_first = 0xfffff80cad278b60 }, handle = 0x0, un_pager = { vnp = { vnp_size = 19444, writemappings = 0 }, devp = { devp_pglist = { tqh_first = 0x4bf4, tqh_last = 0x0 }, ops = 0x0, dev = 0x0 }, sgp = { sgp_pglist = { tqh_first = 0x4bf4, tqh_last = 0x0 } }, swp = { swp_tmpfs = 0x4bf4, swp_blks = { pt_root = 0 } } }, cred = 0xfffff806811adc00, 
charge = 6393856, umtx_data = 0x0 } Interesting that it is on a shadow list. -- Andriy Gapon

Received on Tue May 29 2018 - 11:50:19 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:16 UTC