Re: svn commit: r360233 - in head: contrib/jemalloc . . . : This partially breaks a 2-socket 32-bit powerpc (old PowerMac G4) based on head -r360311

From: Mark Millard <marklmi_at_yahoo.com>
Date: Sun, 3 May 2020 20:35:35 -0700
This note just reports what I found by looking at
2 .core files (mountd and rpcbind) from the new
jemalloc context's failures. Maybe someone who
knows more can get something out of it. I've
not included any of the prior message history.


For mountd:

Core was generated by `/usr/sbin/mountd -r'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x50235df0 in cache_bin_dalloc_easy (bin=<optimized out>, bin_info=<optimized out>, ptr=0x50049160) at /usr/src/contrib/jemalloc/include/jemalloc/internal/cache_bin.h:121

warning: Source file is more recent than executable.
121		if (unlikely(bin->ncached == bin_info->ncached_max)) {

It turns out that bin_info traces back to:

        cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind);
        cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind];
        if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) {
                return false;
        }

based on:

#define tcache_bin_info JEMALLOC_N(tcache_bin_info)
and:
#  define JEMALLOC_N(n) __je_##n

But gdb reports:

(gdb) print __je_tcache_bin_info
$3 = (cache_bin_info_t *) 0x0

(gdb) print alloc_ctx
$1 = {szind = 0, slab = <synthetic pointer>}

so bin_info is NULL, and the bin_info->ncached_max
access would fault (and does).
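
For what it is worth, here is a minimal sketch (my
own reduction, not jemalloc's actual code) of why
that combination faults; the names mirror the
excerpt above:

/*
 * tcache_bin_info expands, via JEMALLOC_N(), to __je_tcache_bin_info,
 * which gdb shows is still NULL here.  With szind == 0,
 * &tcache_bin_info[szind] is also NULL, so reading
 * bin_info->ncached_max loads from address 0 and segfaults,
 * matching the cache_bin.h:121 frame.
 */
#include <stddef.h>

typedef struct {
        unsigned ncached_max;
} cache_bin_info_t;

/* stands in for __je_tcache_bin_info, which gdb printed as 0x0 */
static cache_bin_info_t *tcache_bin_info = NULL;

static unsigned
failing_lookup(size_t szind)
{
        cache_bin_info_t *bin_info = &tcache_bin_info[szind]; /* NULL + 0 */

        return (bin_info->ncached_max); /* SIGSEGV, like line 121 above */
}

int
main(void)
{
        return ((int)failing_lookup(0));
}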

By contrast, bin->ncached seems to come from:

(gdb) print *(cache_bin_t*)0x50094018
$6 = {low_water = 65536, ncached = 1, tstats = {nrequests = 4796680610075595179}, avail = 0x0}


For rpcbind:

Core was generated by `/usr/sbin/rpcbind'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x50243fec in rendezvous_request (xprt=<optimized out>, msg=<optimized out>) at /usr/src/lib/libc/rpc/svc_vc.c:323
323			newxprt->xp_raddr = *(struct sockaddr_in *)newxprt->xp_rtaddr.buf;

(gdb) print *newxprt
$5 = {xp_fd = 14, xp_port = 0, xp_ops = 0x50329e1c, xp_addrlen = 0, xp_raddr = {sin_len = 0 '\000', sin_family = 0 '\000', sin_port = 0, sin_addr = {s_addr = 0}, 
    sin_zero = "\000\000\000\000P1O\374"}, xp_ops2 = 0x50329e34, xp_tp = 0x0, xp_netid = 0x50047310 "unix", xp_ltaddr = {maxlen = 1345322064, len = 1970170232, buf = 0x2020}, xp_rtaddr = {
    maxlen = 268500992, len = 16, buf = 0x0}, xp_verf = {oa_flavor = 0, oa_base = 0x202d2020 <error: Cannot access memory at address 0x202d2020>, oa_length = 538976364}, xp_p1 = 0x6f6f7000, 
  xp_p2 = 0x0, xp_p3 = 0x202d2020, xp_type = 538976288}

so newxprt->xp_rtaddr.buf == 0x0 . But that is very odd . . .

        /*
         * make a new transporter (re-uses xprt)
         */
        newxprt = makefd_xprt(sock, r->sendsize, r->recvsize);
        newxprt->xp_rtaddr.buf = mem_alloc(len);
        if (newxprt->xp_rtaddr.buf == NULL)
                return (FALSE);
        memcpy(newxprt->xp_rtaddr.buf, &addr, len);
        newxprt->xp_rtaddr.len = len;
#ifdef PORTMAP
        if (addr.ss_family == AF_INET || addr.ss_family == AF_LOCAL) {
                newxprt->xp_raddr = *(struct sockaddr_in *)newxprt->xp_rtaddr.buf;
                newxprt->xp_addrlen = sizeof (struct sockaddr_in);
        }
#endif                          /* PORTMAP */

Or, in machine code terms:

   0x50243f90 <+260>:	mr      r28,r3
   0x50243f94 <+264>:	lwz     r4,0(r24)
   0x50243f98 <+268>:	lwz     r5,4(r24)
   0x50243f9c <+272>:	mr      r3,r28
   0x50243fa0 <+276>:	bl      0x5024308c <makefd_xprt>
   0x50243fa4 <+280>:	lwz     r27,36(r1)
   0x50243fa8 <+284>:	mr      r29,r3
   0x50243fac <+288>:	li      r3,1
   0x50243fb0 <+292>:	mr      r4,r27
   0x50243fb4 <+296>:	bl      0x502e3214 <00000000.plt_pic32.calloc>
   0x50243fb8 <+300>:	cmplwi  r3,0
   0x50243fbc <+304>:	stw     r3,64(r29)
   0x50243fc0 <+308>:	beq     0x50244160 <rendezvous_request+724>

Note: getting here means that newxprt->xp_rtaddr.buf
was not NULL. Also, the value was stored to 64(r29).

   0x50243fc4 <+312>:	addi    r4,r1,296
   0x50243fc8 <+316>:	mr      r5,r27
   0x50243fcc <+320>:	bl      0x502e3264 <00000000.plt_pic32.memcpy>

Note: getting here means that memcpy was able to
store to where newxprt->xp_rtaddr.buf pointed
(passed in r3 as the destination).

   0x50243fd0 <+324>:	lbz     r3,297(r1)
   0x50243fd4 <+328>:	stw     r27,60(r29)
   0x50243fd8 <+332>:	addi    r3,r3,-1
   0x50243fdc <+336>:	clrlwi  r3,r3,24
   0x50243fe0 <+340>:	cmplwi  r3,1
   0x50243fe4 <+344>:	bgt     0x50244014 <rendezvous_request+392>
   0x50243fe8 <+348>:	lwz     r3,64(r29)
=> 0x50243fec <+352>:	lwz     r4,0(r3)

Note: the failure is at the instruction marked above, with r3 == 0x0:

(gdb) info reg
r0             0x50243fb8          1344552888
r1             0xffffb400          4294947840
r2             0x500a1018          1342836760
r3             0x0                 0
r4             0xffffb538          4294948152
r5             0x10                16
r6             0x50047328          1342468904
r7             0x0                 0
r8             0x50047324          1342468900
r9             0x0                 0
r10            0x20                32
r11            0x502d691c          1345153308
r12            0x24200880          606079104
r13            0x0                 0
r14            0x0                 0
r15            0xffffbc28          4294949928
r16            0x10002848          268445768
r17            0x10040000          268697600
r18            0x2                 2
r19            0x0                 0
r20            0x1                 1
r21            0x5004c044          1342488644
r22            0xffffb63c          4294948412
r23            0x80                128
r24            0x50048010          1342472208
r25            0x14                20
r26            0xffffb630          4294948400
r27            0x10                16
r28            0xe                 14
r29            0x500472e0          1342468832
r30            0x5030112c          1345327404
r31            0x10040000          268697600
pc             0x50243fec          0x50243fec <rendezvous_request+352>
msr            <unavailable>
cr             0x84200080          2216689792
lr             0x50243fd0          0x50243fd0 <rendezvous_request+324>
ctr            0x0                 0
xer            0x0                 0
fpscr          0x0                 0
vscr           <unavailable>
vrsave         <unavailable>

Note: the rest of the two assignments (to xp_raddr
and xp_addrlen) would have been done by:

   0x50243ff0 <+356>:	stw     r4,16(r29)
   0x50243ff4 <+360>:	lwz     r4,4(r3)
   0x50243ff8 <+364>:	stw     r4,20(r29)
   0x50243ffc <+368>:	lwz     r4,8(r3)
   0x50244000 <+372>:	stw     r4,24(r29)
   0x50244004 <+376>:	li      r4,16
   0x50244008 <+380>:	lwz     r3,12(r3)
   0x5024400c <+384>:	stw     r4,12(r29)
   0x50244010 <+388>:	stw     r3,28(r29)
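
As a sanity check on how I am reading those
register+offset references, a tiny program like the
one below (assuming a FreeBSD 32-bit powerpc build
environment, where <rpc/rpc.h> provides SVCXPRT; the
expected values are my reading of the disassembly,
not something I have rechecked against the headers)
should show 12(r29), 16(r29), 60(r29), and 64(r29)
landing on xp_addrlen, xp_raddr, xp_rtaddr.len, and
xp_rtaddr.buf of newxprt (r29):

#include <stddef.h>
#include <stdio.h>
#include <rpc/rpc.h>

int
main(void)
{
        printf("xp_addrlen    %zu\n", offsetof(SVCXPRT, xp_addrlen));    /* expect 12 */
        printf("xp_raddr      %zu\n", offsetof(SVCXPRT, xp_raddr));      /* expect 16 */
        printf("xp_rtaddr.len %zu\n", offsetof(SVCXPRT, xp_rtaddr.len)); /* expect 60 */
        printf("xp_rtaddr.buf %zu\n", offsetof(SVCXPRT, xp_rtaddr.buf)); /* expect 64 */
        return (0);
}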

My only guesses at possible explanations are:

A) A processor context switch where the other
processor did not (yet) see the value stored
via 64(r29) and so it used an older value.

B) Something trashed the memory at 64(r29)
after it was updated by the code above.

C) r29's value was trashed by something,
changing where 64(r29) referenced.

D) r3 was trashed between the two back-to-back
instructions:

   0x50243fe8 <+348>:	lwz     r3,64(r29)
=> 0x50243fec <+352>:	lwz     r4,0(r3)

E) ???

I've no clue which.


Both .core files seem to have zeros showing up in
unexpected places that previously held non-zero values.

===
Mark Millard
marklmi at yahoo.com
( dsl-only.net went
away in early 2018-Mar)