Re: Fatal error 'mutex is on list' at line 139 in file /usr/src/lib/libthr/thread/thr_mutex.c (errno = 35)

From: Oleg V. Nauman <oleg_at_opentransfer.com> Date: Mon, 21 Mar 2016 12:15:15 +0200 · This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:03 UTC

On Monday 21 March 2016 09:07:10 you wrote:
> On Mon, Mar 21, 2016 at 07:21:02AM +0200, Konstantin Belousov wrote:
> > On Sun, Mar 20, 2016 at 09:28:13AM +0200, Oleg V. Nauman wrote:
> > > Fatal error 'mutex 0x800632000 own 0x1885c 0x1885c is on list 0x0
> > > 0x80d46ebc0' at line 155 in file /usr/src/lib/libthr/thread/thr_mutex.c
> > > (errno = 2 )> > 
> > >  What I have got after applying this patch:
> > > #0  0x0000000805913d6a in thr_kill () from /lib/libc.so.7
> > > #1  0x0000000805913d3b in __raise (s=6) at
> > > /usr/src/lib/libc/gen/raise.c:52
> > > #2  0x0000000805913ca9 in abort () at
> > > /usr/src/lib/libc/stdlib/abort.c:65
> > > #3  0x0000000805639554 in _thread_exit (
> > > 
> > >     fname=0x80563ac36 "/usr/src/lib/libthr/thread/thr_mutex.c",
> > >     lineno=155,
> > >     msg=0x7fffffffd1c0 "mutex 0x800632000 own 0x1885c 0x1885c is on list
> > >     0x0
> > > 
> > > 0x80d46ebc0")
> > > 
> > >     at /usr/src/lib/libthr/thread/thr_exit.c:182
> > > 
> > > #4  0x000000080562fe36 in mutex_assert_not_owned (m=0x800632000)
> > > 
> > >     at /usr/src/lib/libthr/thread/thr_mutex.c:155
> > > 
> > > #5  0x0000000805630009 in enqueue_mutex (curthread=0x80cc15000,
> > > m=0x800632000)> > 
> > >     at /usr/src/lib/libthr/thread/thr_mutex.c:400
> > > 
> > > #6  0x0000000805631665 in mutex_lock_sleep (curthread=0x80cc15000,
> > > m=0x800632000,
> > > 
> > >     abstime=0x7fffffffd358) at
> > >     /usr/src/lib/libthr/thread/thr_mutex.c:535
> > > 
> > > #7  0x0000000805630280 in mutex_lock_common (m=0x800632000,
> > > abstime=0x7fffffffd358,
> > > 
> > >     cvattach=0) at /usr/src/lib/libthr/thread/thr_mutex.c:553
> > > 
> > > #8  0x000000080562f6be in __pthread_mutex_timedlock (mutex=0x810a00008,
> > > 
> > >     abstime=0x7fffffffd358) at
> > >     /usr/src/lib/libthr/thread/thr_mutex.c:583
> > > 
> > > ...
> > > gdb) f 6
> > > #6  0x0000000805631665 in mutex_lock_sleep (curthread=0x80cc15000,
> > > m=0x800632000,
> > > 
> > >     abstime=0x7fffffffd358) at
> > >     /usr/src/lib/libthr/thread/thr_mutex.c:535
> > > 
> > > 535                     enqueue_mutex(curthread, m);
> > > (gdb) p *curthread
> > > $1 = {tid = 100444, lock = {m_owner = 0, m_flags = 0, m_ceilings = {0,
> > > 0},
> > > m_spare = {0, 0,
> > > 
> > >       0, 0}}, cycle = 0, locklevel = 0, critical_count = 0, sigblock =
> > >       0, tle
> > > 
> > > = {
> > > 
> > >     tqe_next = 0x0, tqe_prev = 0x805841000 <_thread_list>}, gcle =
> > >     {tqe_next =
> > > 
> > > 0x0,
> > > 
> > >     tqe_prev = 0x0}, hle = {le_next = 0x0, le_prev = 0x80584b3c0}, wle =
> > > 
> > > {tqe_next = 0x0,
> > > 
> > >     tqe_prev = 0x0}, refcount = 0, start_routine = 0x0, arg = 0x0, attr
> > >     =
> > > 
> > > {sched_policy = 2,
> > > 
> > >     sched_inherit = 4, prio = 0, suspend = 0, flags = 258,
> > >     stackaddr_attr =
> > > 
> > > 0x7ffffdfff000,
> > > 
> > >     stacksize_attr = 33554432, guardsize_attr = 4096, cpuset = 0x0,
> > >     cpusetsize
> > > 
> > > = 0},
> > > 
> > >   cancel_enable = 1, cancel_pending = 0, cancel_point = 0, no_cancel =
> > >   0,
> > > 
> > > cancel_async = 0,
> > > 
> > >   cancelling = 0, sigmask = {__bits = {0, 0, 0, 0}}, unblock_sigcancel =
> > >   0,
> > >   in_sigsuspend = 0, deferred_siginfo = {si_signo = 0, si_errno = 0,
> > >   si_code => > 
> > > 0, si_pid = 0,
> > > 
> > >     si_uid = 0, si_status = 0, si_addr = 0x0, si_value = {sival_int = 0,
> > > 
> > > sival_ptr = 0x0,
> > > 
> > >       sigval_int = 0, sigval_ptr = 0x0}, _reason = {_fault = {_trapno =
> > >       0},
> > > 
> > > _timer = {
> > > 
> > >         _timerid = 0, _overrun = 0}, _mesgq = {_mqd = 0}, _poll = {_band
> > >         = 0},
> > > 
> > > __spare__ = {
> > > 
> > >         __spare1__ = 0, __spare2__ = {0, 0, 0, 0, 0, 0, 0}}}},
> > > 
> > > deferred_sigmask = {__bits = {
> > > 
> > >       0, 0, 0, 0}}, deferred_sigact = {__sigaction_u = {__sa_handler =
> > >       0x0,
> > >       __sa_sigaction = 0x0}, sa_flags = 0, sa_mask = {__bits = {0, 0, 0,
> > >       0}}},
> > >   
> > >   deferred_run = 0, force_exit = 0, state = PS_RUNNING, error = 0,
> > >   joiner =
> > > 
> > > 0x0, flags = 0,
> > > 
> > >   tlflags = 2, mq = {{tqh_first = 0x0, tqh_last = 0x80cc151a0},
> > >   {tqh_first =
> > > 
> > > 0x0,
> > > 
> > >       tqh_last = 0x80cc151b0}, {tqh_first = 0x0, tqh_last =
> > >       0x80cc151c0},
> > > 
> > > {tqh_first = 0x0,
> > > 
> > >       tqh_last = 0x80cc151d0}}, ret = 0x0, specific = 0x800631000,
> > > 
> > > specific_data_count = 5,
> > > 
> > >   rdlock_count = 0, rtld_bits = 0, tcb = 0x8006df108, cleanup = 0x0, ex
> > >   = {
> > >   
> > >     exception_class = 0, exception_cleanup = 0x0, private_1 = 0,
> > >     private_2 =
> > > 
> > > 0},
> > > 
> > >   unwind_stackend = 0x7ffffffff000, unwind_disabled = 0, magic =
> > >   3499860245,
> > >   report_events = 0, event_mask = 0, event_buf = {event = TD_EVENT_NONE,
> > >   th_p
> > > 
> > > = 0, data = 0},
> > > 
> > >   wchan = 0x0, mutex_obj = 0x0, will_sleep = 0, nwaiter_defer = 0,
> > > 
> > > defer_waiters = {
> > > 
> > >     0x0 <repeats 50 times>}, wake_addr = 0x80584b0c8, sleepqueue =
> > > 
> > > 0x80cc14040}
> > > 
> > > (gdb) f 8
> > > #8  0x000000080562f6be in __pthread_mutex_timedlock (mutex=0x810a00008,
> > > 
> > >     abstime=0x7fffffffd358) at
> > >     /usr/src/lib/libthr/thread/thr_mutex.c:583
> > > 
> > > 583                     ret = mutex_lock_common(m, abstime, 0);
> > > (gdb) p *mutex
> > > $2 = (pthread_mutex_t) 0x8000000000000001
> > > (gdb) p m
> > > $3 = (struct pthread_mutex *) 0x800632000
> > > (gdb) p *m
> > > $4 = {m_lock = {m_owner = 100444, m_flags = 1, m_ceilings = {0, 0},
> > > m_spare = {0, 0, 0, 0}},
> > > 
> > >   m_flags = 1, m_owner = 100444, m_count = 0, m_spinloops = 0,
> > >   m_yieldloops =
> > > 
> > > 0, m_qe = {
> > > 
> > >     tqe_next = 0x0, tqe_prev = 0x0}, m_pqe = {tqe_next = 0x0, tqe_prev =
> > >     0x0}}
> > 
> > Thank you, this is useful, but misterious. Below is my current
> > (mis-)understanding of the problem based on your debugging data. I wrote
> > this mostly to myself, and continue looking at it, but might be somebody
> > else is curious.
> > 
> > Note that there is inconsistency in the debugging output from libthr,
> > which reports that m->m_qe.tqe_next is not NULL (and it looks like a
> > reasonable pointer), while gdb reports that both pointers from m_qe are
> > correctly NULLs. The m_lock.m_owner and m_owner values are fine both
> > from libthr output and from gdb output. Additional interesting detail is
> > that m_qe.tqe_prev is NULL, which agrees with the order of zeroing in
> > mutex_init_link().
> > 
> > So it sounds as if dequeue_mutex() is executed after the m_lock.m_owner
> > relinguished ownership of the mutex. Visual code inspection does not
> > reveal such pathes, it would be huge bug anyway. I currently do not see
> > how it is possible at all. Either some additional code is somewhere, or
> > memory ordering is broken, or memory is corrupted.
> > 
> > That said, I will flush my patch queue for the libthr shortly, which
> > includes the fork/pshared fix you already tested. Ideally, I need the
> > minimal stand-alone binary or source which demostrates the problem. I
> > understand that it is hard to get with KDE.
> > 
> > As an aside question, what hardware is used to reproduce the assert ?
> 
> Please, use the stock libthr from r297141, and apply the following
> debugging kernel patch.  I am interested whether the messages appear
> in dmesg/console for your load.
> 
> diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c
> index 9474b0b..7718870 100644
> --- a/sys/kern/kern_umtx.c
> +++ b/sys/kern/kern_umtx.c
> _at__at_ -3629,6 +3629,7 _at__at_ umtx_shm_create_reg(struct thread *td, const struct
> umtx_key *key,
> 
>  	reg = umtx_shm_find_reg(key);
>  	if (reg != NULL) {
> +printf("pid %d creating existing key 1\n", td->td_proc->p_pid);
>  		*res = reg;
>  		return (0);
>  	}
> _at__at_ -3650,6 +3651,7 _at__at_ umtx_shm_create_reg(struct thread *td, const struct
> umtx_key *key, if (reg1 != NULL) {
>  		mtx_unlock(&umtx_shm_lock);
>  		umtx_shm_free_reg(reg);
> +printf("pid %d creating existing key 2\n", td->td_proc->p_pid);
>  		*res = reg1;
>  		return (0);
>  	}

 OK, but please take a look what I have found ( it makes me thinking that 
problem is within the compiled KDE code ):
 The failure point within the KDE code is the same ( at least it is true for 
coredumps generated today ):

#7  0x0000000805a2f6be in __pthread_mutex_timedlock (mutex=0x81b200008, 
    abstime=0x7fffffffd458) at /usr/src/lib/libthr/thread/thr_mutex.c:583
#8  0x000000080443c4b0 in pthreadTimedLock::lock (this=0x81777b680)
    at 
/usr/ports/x11/kdelibs4/work/kdelibs-4.14.3/kdecore/util/kshareddatacache_p.h:252
....
(gdb) f 8
#8  0x000000080443c4b0 in pthreadTimedLock::lock (this=0x81777b680)
    at 
/usr/ports/x11/kdelibs4/work/kdelibs-4.14.3/kdecore/util/kshareddatacache_p.h:252
252             return pthread_mutex_timedlock(&m_mutex, &timeout) == 0;
(gdb) p &m_mutex
$1 = (pthread_mutex_t *) 0x81b200008
(gdb) p m_mutex
$2 = (pthread_mutex_t &) _at_0x81b200008: 0x8000000000000001
(gdb) p &timeout
$3 = (timespec *) 0x6
(gdb) p timeout
Cannot access memory at address 0x6
(gdb)

It seems that both m_mutex and timeout are wrong

The class which generates coredumps looks like:

#if defined(KSDC_THREAD_PROCESS_SHARED_SUPPORTED) && 
defined(KSDC_TIMEOUTS_SUPPORTED)
class pthreadTimedLock : public pthreadLock
{
public:
    pthreadTimedLock(pthread_mutex_t &mutex)
        : pthreadLock(mutex)
    {
    }

    virtual bool lock()
    {
        struct timespec timeout;

        // Long timeout, but if we fail to meet this timeout it's probably a 
cache
        // corruption (and if we take 8 seconds then it should be much much 
quicker
        // the next time anyways since we'd be paged back in from disk)
        timeout.tv_sec = 10 + ::time(NULL); // Absolute time, so 10 seconds 
from now
        timeout.tv_nsec = 0;

        return pthread_mutex_timedlock(&m_mutex, &timeout) == 0;
    }
};
#endif

It is called by:

(gdb) f 9
#9  0x000000080443c8a8 in KSharedDataCache::Private::CacheLocker::cautiousLock 
(
    this=0x7fffffffd5f0)
    at 
/usr/ports/x11/kdelibs4/work/kdelibs-4.14.3/kdecore/util/kshareddatacache.cpp:1259
1259                while (!d->lock() && !isLockedCacheSafe()) {
gdb) p *d
$4 = {m_cacheName = {static null = {<No data fields>}, static shared_null = 
{ref = {
        _q_value = 2731}, alloc = 0, size = 0, data = 0x6192ca 
<QString::shared_null+26>,
      clean = 0, simpletext = 0, righttoleft = 0, asciiCache = 0, capacity = 
0, reserved = 0,
      array = {0}}, static shared_empty = {ref = {_q_value = 50}, alloc = 0, 
size = 0,
      data = 0x805105c3a <QString::shared_empty+26>, clean = 0, simpletext = 
0,
      righttoleft = 0, asciiCache = 0, capacity = 0, reserved = 0, array = 
{0}},
    d = 0x8176e8180, static codecForCStrings = 0x0}, shm = 0x81b200000,
  m_lock = {<QtSharedPointer::ExternalRefCount<KSDCLock>> = 
{<QtSharedPointer::Basic<KSDCLock>> = {value = 0x81777b680}, d = 0x81777b6c0}, 
<No data fields>}, m_mapSize = 10547304,
  m_defaultCacheSize = 10485760, m_expectedItemSize = 0, m_expectedType = 
LOCKTYPE_MUTEX}
(gdb) p d
$5 = (KSharedDataCache::Private *) 0x8176d2030

Well I understand that unwinding the KDE code it is a task not for humans..

The hardware is ASUS X552C notebook, Ivybridge, amd64
I noticed massive coredumps after x11/kdelibs4 recompilation with clang 3.8.0 
so it is possible that it is a problem with code generation.
It is does not depend on optimization level ( at least it exhibits the same 
behavior for both -O2 and -O0 )
The only CPU/optimization/code generation specific setting is
CPUTYPE?=nehalem
in make.conf

Thank you