Hello, hope you're having a nice day, Revisions mentioned are those which were tested by me; r195849+ has the corruption padded somewhere else so it might produce a panic with a different set of options. For reference, my test kernel uses a GENERIC config from May 09 snapshot without WITNESS and with IPFIREWALL, IPFIREWALL_DEFAULT_TO_ACCEPT and DEVICE_POLLING enabled. If someone experiences fatal traps under network load, and has the kernel compiled with "options INVARIANTS", here's a patch to check if you're suffering the memory corruption in netisr's DPCPU area. I'm pretty interested in the backtraces this panic() call will produce. Please note: with this patch, your system - if affected by the aforementioned problem - will trap *almost immediately* after the corruption happens, while a non-patched system can survive that for a bit if by the time v is assigned mtx_lock's value in _mtx_sleep_flags() the value is already fixed by a concurrent thread. I highly recommend having a backup 'normal kernel'. DO NOT apply this patch on a system you can't access via local (and/or IPMI/LOM) or serial console. For systems without INVARIANTS (although I'm not sure if the issue affects non-INVARIANTS builds) you can replace KASSERT(!(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)),("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", ((long long unsigned)foo), (mycpuid))); with if(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)) panic("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", (long long unsigned)foo, mycpuid); You can also replace the panic() call with a simple printf(), and it will just print out the warning message, but that won't really help debugging the cause, although will check if you're getting a fatal trap because of that aforementioned corruption. I will be really grateful if someone can help me debug this issue, namely, point me at how can I trace the problem to a thread/system call which could have messed with the memory area in question. -- Kamigishi Rei KREI-RIPE Index: sys/kern/kern_synch.c =================================================================== --- sys/kern/kern_synch.c (revision 195848) +++ sys/kern/kern_synch.c (working copy) _at__at_ -71,6 +71,8 _at__at_ #include <vm/pmap.h> #endif +#include <net/netisr.h> + #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ _at__at_ -391,7 +393,18 _at__at_ uint64_t runtime, new_switchtime; struct thread *td; struct proc *p; + struct netisr_workstream *nwsp; + uintptr_t foo; + unsigned int mycpuid; + for (mycpuid = 0; mycpuid < mp_maxid; mycpuid++) { + nwsp = DPCPU_ID_PTR(mycpuid, nws); + if (mtx_initialized(&(nwsp->nws_mtx))) { + foo = nwsp->nws_mtx.mtx_lock; + KASSERT(!(((foo & 0x8000000000000000)==0x0) && (foo != MTX_UNOWNED)),("mi_switch: DPCPU sanity checks: netisr workstream mutex nws_mtx contains an invalid pointer %llx in mtx_lock; this will lead to a page fault (cpuid: %u). Terminating.\n", ((long long unsigned)foo), (mycpuid))); + } + } + td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); p = td->td_proc; /* XXX */ Index: sys/net/netisr.c =================================================================== --- sys/net/netisr.c (revision 195848) +++ sys/net/netisr.c (working copy) _at__at_ -203,8 +203,6 _at__at_ u_int np_policy; /* Work placement policy. */ }; -#define NETISR_MAXPROT 16 /* Compile-time limit. */ - /* * The np array describes all registered protocols, indexed by protocol * number. _at__at_ -212,53 +210,6 _at__at_ static struct netisr_proto np[NETISR_MAXPROT]; /* - * Protocol-specific work for each workstream is described by struct - * netisr_work. Each work descriptor consists of an mbuf queue and - * statistics. - */ -struct netisr_work { - /* - * Packet queue, linked by m_nextpkt. - */ - struct mbuf *nw_head; - struct mbuf *nw_tail; - u_int nw_len; - u_int nw_qlimit; - u_int nw_watermark; - - /* - * Statistics -- written unlocked, but mostly from curcpu. - */ - u_int64_t nw_dispatched; /* Number of direct dispatches. */ - u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */ - u_int64_t nw_qdrops; /* "" drops. */ - u_int64_t nw_queued; /* "" enqueues. */ - u_int64_t nw_handled; /* "" handled in worker. */ -}; - -/* - * Workstreams hold a set of ordered work across each protocol, and are - * described by netisr_workstream. Each workstream is associated with a - * worker thread, which in turn is pinned to a CPU. Work associated with a - * workstream can be processd in other threads during direct dispatch; - * concurrent processing is prevented by the NWS_RUNNING flag, which - * indicates that a thread is already processing the work queue. - */ -struct netisr_workstream { - struct intr_event *nws_intr_event; /* Handler for stream. */ - void *nws_swi_cookie; /* swi(9) cookie for stream. */ - struct mtx nws_mtx; /* Synchronize work. */ - u_int nws_cpu; /* CPU pinning. */ - u_int nws_flags; /* Wakeup flags. */ - u_int nws_pendingbits; /* Scheduled protocols. */ - - /* - * Each protocol has per-workstream data. - */ - struct netisr_work nws_work[NETISR_MAXPROT]; -} __aligned(CACHE_LINE_SIZE); - -/* * Per-CPU workstream data. */ DPCPU_DEFINE(struct netisr_workstream, nws); Index: sys/net/netisr.h =================================================================== --- sys/net/netisr.h (revision 195848) +++ sys/net/netisr.h (working copy) _at__at_ -111,6 +111,60 _at__at_ }; /* + * Protocol-specific work for each workstream is described by struct + * netisr_work. Each work descriptor consists of an mbuf queue and + * statistics. + */ +struct netisr_work { + /* + * Packet queue, linked by m_nextpkt. + */ + struct mbuf *nw_head; + struct mbuf *nw_tail; + u_int nw_len; + u_int nw_qlimit; + u_int nw_watermark; + + /* + * Statistics -- written unlocked, but mostly from curcpu. + */ + u_int64_t nw_dispatched; /* Number of direct dispatches. */ + u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */ + u_int64_t nw_qdrops; /* "" drops. */ + u_int64_t nw_queued; /* "" enqueues. */ + u_int64_t nw_handled; /* "" handled in worker. */ +}; + +#define NETISR_MAXPROT 16 /* Compile-time limit. */ + +/* + * Workstreams hold a set of ordered work across each protocol, and are + * described by netisr_workstream. Each workstream is associated with a + * worker thread, which in turn is pinned to a CPU. Work associated with a + * workstream can be processd in other threads during direct dispatch; + * concurrent processing is prevented by the NWS_RUNNING flag, which + * indicates that a thread is already processing the work queue. + */ +struct netisr_workstream { + struct intr_event *nws_intr_event; /* Handler for stream. */ + void *nws_swi_cookie; /* swi(9) cookie for stream. */ + struct mtx nws_mtx; /* Synchronize work. */ + u_int nws_cpu; /* CPU pinning. */ + u_int nws_flags; /* Wakeup flags. */ + u_int nws_pendingbits; /* Scheduled protocols. */ + + /* + * Each protocol has per-workstream data. + */ + struct netisr_work nws_work[NETISR_MAXPROT]; +} __aligned(CACHE_LINE_SIZE); + +/* + * Declare per-CPU workstream data globally + */ +DPCPU_DECLARE(struct netisr_workstream, nws); + +/* * Register, unregister, and other netisr handler management functions. */ void netisr_clearqdrops(const struct netisr_handler *nhp);Received on Tue Jul 28 2009 - 12:24:54 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:39:52 UTC